From e5571bafd025b9f7a913652c15266a8ec171f0f1 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 29 Jun 2026 06:49:15 +0200 Subject: [PATCH] fix(eval): stop surfacing provider staging logs --- .../cli/src/commands/eval/progress-display.ts | 21 +----------- .../commands/eval/artifact-writer.test.ts | 1 + .../commands/eval/progress-display.test.ts | 20 +++++++++++ packages/core/src/evaluation/run-artifacts.ts | 33 ++++++++++++++++++- .../core/test/evaluation/orchestrator.test.ts | 20 ++++++++--- 5 files changed, 70 insertions(+), 25 deletions(-) diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts index f5db6a1b2..58805e0bd 100644 --- a/apps/cli/src/commands/eval/progress-display.ts +++ b/apps/cli/src/commands/eval/progress-display.ts @@ -65,8 +65,6 @@ export class ProgressDisplay { private readonly workers: Map = new Map(); private totalTests = 0; private completedTests = 0; - private readonly logPaths: string[] = []; - private readonly logPathSet = new Set(); private started = false; private finished = false; private readonly verbose: boolean; @@ -133,24 +131,7 @@ export class ProgressDisplay { } addLogPaths(paths: readonly string[]): void { - const newPaths: string[] = []; - for (const path of paths) { - if (this.logPathSet.has(path)) { - continue; - } - this.logPathSet.add(path); - newPaths.push(path); - } - - if (newPaths.length === 0) { - return; - } - - this.logPaths.push(...newPaths); - - for (const p of newPaths) { - console.log(`Provider log: ${p}`); - } + void paths; } finish(): void { diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 16ba3d234..2ba014d14 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -1655,6 +1655,7 @@ describe('writeArtifactsFromResults', () => { const transcriptPath = runArtifactPath(testDir, indexLine, 'run-1', 'transcript-raw.jsonl'); await expect(readFile(transcriptPath, 'utf8')).resolves.toBe(rawLog); + await expect(readFile(rawLogPath, 'utf8')).resolves.toBe(rawLog); await expect(readFile(path.join(testDir, rowDir, 'transcript.json'), 'utf8')).rejects.toThrow(); const transcriptLines = ( diff --git a/apps/cli/test/commands/eval/progress-display.test.ts b/apps/cli/test/commands/eval/progress-display.test.ts index 5b505791c..9f82b9571 100644 --- a/apps/cli/test/commands/eval/progress-display.test.ts +++ b/apps/cli/test/commands/eval/progress-display.test.ts @@ -105,4 +105,24 @@ describe('ProgressDisplay', () => { expect(logs).toEqual(['1/1 ✅ test-01-biosecurity | wtalms-stg | 98% PASS']); }); + + it('does not print provider staging log paths', () => { + const display = new ProgressDisplay(1); + const logs: string[] = []; + const logSpy = mock((message?: unknown) => { + logs.push(String(message ?? '')); + }); + const originalLog = console.log; + console.log = logSpy as typeof console.log; + + try { + display.addLogPaths([ + '/tmp/agentv-provider-streams/run-001/case/logs/codex/codex-stream.log', + ]); + } finally { + console.log = originalLog; + } + + expect(logs).toEqual([]); + }); }); diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index 37e73c5e5..27dbd2312 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -8,7 +8,8 @@ */ import { createHash } from 'node:crypto'; -import { copyFile, mkdir, readFile, writeFile } from 'node:fs/promises'; +import { copyFile, mkdir, readFile, rm, rmdir, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; import path from 'node:path'; import { @@ -1520,6 +1521,35 @@ function rawProviderLogSourcePath(result: EvaluationResult): string | undefined return sourcePath ? sourcePath : undefined; } +function providerStagingRoot(): string { + return path.resolve(tmpdir(), 'agentv-provider-streams'); +} + +function isAgentvProviderStagingPath(filePath: string): boolean { + const root = providerStagingRoot(); + const resolved = path.resolve(filePath); + return resolved.startsWith(`${root}${path.sep}`); +} + +async function cleanupProviderStagingFile(filePath: string): Promise { + if (!isAgentvProviderStagingPath(filePath)) { + return; + } + + await rm(filePath, { force: true }); + + const root = providerStagingRoot(); + let current = path.dirname(path.resolve(filePath)); + while (current !== root && current.startsWith(`${root}${path.sep}`)) { + try { + await rmdir(current); + } catch { + break; + } + current = path.dirname(current); + } +} + interface TraceEnvelopeSidecarParams { readonly result: EvaluationResult; readonly outputDir: string; @@ -1748,6 +1778,7 @@ async function writeRawTranscriptJsonl( const rawSource = rawProviderLogSourcePath(result); if (rawSource) { await copyFile(rawSource, filePath); + await cleanupProviderStagingFile(rawSource).catch(() => undefined); return; } await writeGeneratedRawTranscriptJsonl(filePath, result, envelope); diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 755d031fa..9ddaa1c05 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -1,5 +1,12 @@ import { afterEach, describe, expect, it, mock } from 'bun:test'; -import { mkdtempSync, readFileSync, readdirSync, writeFileSync } from 'node:fs'; +import { + existsSync, + mkdirSync, + mkdtempSync, + readFileSync, + readdirSync, + writeFileSync, +} from 'node:fs'; import { tmpdir } from 'node:os'; import path from 'node:path'; @@ -723,9 +730,13 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, expect(result.failureReasonCode).toBe('provider_error'); }); - it('stores raw provider logs once as transcript-raw evidence', async () => { - const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-raw-provider-log-')); - const rawLogPath = path.join(tempDir, 'provider-native-session.jsonl'); + it('stores raw provider logs once as transcript-raw evidence and cleans staging files', async () => { + const stagingRoot = path.join(tmpdir(), 'agentv-provider-streams'); + mkdirSync(stagingRoot, { recursive: true }); + const tempDir = mkdtempSync(path.join(stagingRoot, 'raw-provider-log-')); + const rawLogDir = path.join(tempDir, 'suite', 'case-1', 'logs', 'codex'); + mkdirSync(rawLogDir, { recursive: true }); + const rawLogPath = path.join(rawLogDir, 'provider-native-session.jsonl'); writeFileSync(rawLogPath, '{"event":"provider-native"}\n', 'utf8'); const provider = new SequenceProvider('mock', { @@ -770,6 +781,7 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, expect(indexRows[0]?.trace_path).toBeUndefined(); expect(indexRows[0]?.transcript_path).toBe(`${resultDir}/run-1/transcript.jsonl`); expect(indexRows[0]?.transcript_raw_path).toBe(`${resultDir}/run-1/transcript-raw.jsonl`); + expect(existsSync(rawLogPath)).toBe(false); }); it('reports failed progress status for batch item errors', async () => {