Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 1 addition & 20 deletions apps/cli/src/commands/eval/progress-display.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,6 @@ export class ProgressDisplay {
private readonly workers: Map<number, WorkerProgress> = new Map();
private totalTests = 0;
private completedTests = 0;
private readonly logPaths: string[] = [];
private readonly logPathSet = new Set<string>();
private started = false;
private finished = false;
private readonly verbose: boolean;
Expand Down Expand Up @@ -133,24 +131,7 @@ export class ProgressDisplay {
}

/**
 * Accepts provider log paths without recording or printing them.
 *
 * The method is a deliberate no-op: it does not touch instance state and
 * writes nothing to the console, so provider staging log locations never
 * appear in the progress output.
 *
 * @param paths - Provider log file paths; ignored.
 */
addLogPaths(paths: readonly string[]): void {
  // Reference the parameter so it is explicitly marked as intentionally unused.
  void paths;
}

finish(): void {
Expand Down
1 change: 1 addition & 0 deletions apps/cli/test/commands/eval/artifact-writer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1655,6 +1655,7 @@ describe('writeArtifactsFromResults', () => {

const transcriptPath = runArtifactPath(testDir, indexLine, 'run-1', 'transcript-raw.jsonl');
await expect(readFile(transcriptPath, 'utf8')).resolves.toBe(rawLog);
await expect(readFile(rawLogPath, 'utf8')).resolves.toBe(rawLog);
await expect(readFile(path.join(testDir, rowDir, 'transcript.json'), 'utf8')).rejects.toThrow();

const transcriptLines = (
Expand Down
20 changes: 20 additions & 0 deletions apps/cli/test/commands/eval/progress-display.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,4 +105,24 @@ describe('ProgressDisplay', () => {

expect(logs).toEqual(['1/1 ✅ test-01-biosecurity | wtalms-stg | 98% PASS']);
});

it('does not print provider staging log paths', () => {
  const display = new ProgressDisplay(1);

  // Collect every console.log message emitted while addLogPaths runs.
  const captured: string[] = [];
  const realConsoleLog = console.log;
  console.log = mock((message?: unknown) => {
    captured.push(String(message ?? ''));
  }) as typeof console.log;

  try {
    const stagingLogPath = '/tmp/agentv-provider-streams/run-001/case/logs/codex/codex-stream.log';
    display.addLogPaths([stagingLogPath]);
  } finally {
    // Always restore the real console.log, even if addLogPaths throws.
    console.log = realConsoleLog;
  }

  expect(captured).toEqual([]);
});
});
33 changes: 32 additions & 1 deletion packages/core/src/evaluation/run-artifacts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
*/

import { createHash } from 'node:crypto';
import { copyFile, mkdir, readFile, writeFile } from 'node:fs/promises';
import { copyFile, mkdir, readFile, rm, rmdir, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import path from 'node:path';

import {
Expand Down Expand Up @@ -1520,6 +1521,35 @@ function rawProviderLogSourcePath(result: EvaluationResult): string | undefined
return sourcePath ? sourcePath : undefined;
}

/**
 * Returns the absolute path of the `agentv-provider-streams` directory
 * located under the operating system's temporary directory.
 */
function providerStagingRoot(): string {
  const stagingDirName = 'agentv-provider-streams';
  const tempBase = tmpdir();
  return path.resolve(tempBase, stagingDirName);
}

/**
 * Reports whether `filePath` lies strictly inside the provider staging root.
 *
 * Containment is decided with `path.relative` rather than a raw string-prefix
 * comparison, so the check follows the platform's own path comparison rules
 * (for example, differing drive-letter case on Windows) instead of an exact
 * character match. The staging root itself is not considered to be inside it.
 *
 * @param filePath - Absolute or relative path; relative paths are resolved
 *   against the current working directory.
 * @returns `true` only when the resolved path is a descendant of the root.
 */
function isAgentvProviderStagingPath(filePath: string): boolean {
  const root = providerStagingRoot();
  const relative = path.relative(root, path.resolve(filePath));

  // An empty result means the path IS the root, which does not count.
  if (relative === '') {
    return false;
  }
  // A leading `..` segment means the path escapes the root. Compare the whole
  // segment so that a child directory merely named e.g. `..cache` still counts.
  if (relative === '..' || relative.startsWith(`..${path.sep}`)) {
    return false;
  }
  // An absolute result means there is no relative route from the root at all
  // (e.g. a different drive on Windows).
  return !path.isAbsolute(relative);
}

/**
 * Removes a provider staging file and then prunes its now-empty parent
 * directories, stopping at the staging root.
 *
 * Paths outside the staging root are left untouched. Directory pruning stops
 * at the first directory that cannot be removed.
 *
 * @param filePath - Path of the staged file to delete.
 */
async function cleanupProviderStagingFile(filePath: string): Promise<void> {
  if (!isAgentvProviderStagingPath(filePath)) {
    return;
  }

  // `force` makes a missing file a non-error.
  await rm(filePath, { force: true });

  const stagingRoot = providerStagingRoot();
  const insideRootPrefix = `${stagingRoot}${path.sep}`;

  // Walk upwards from the file's directory, never touching the root itself.
  for (
    let dir = path.dirname(path.resolve(filePath));
    dir !== stagingRoot && dir.startsWith(insideRootPrefix);
    dir = path.dirname(dir)
  ) {
    try {
      await rmdir(dir);
    } catch {
      // rmdir failed for this directory; stop pruning.
      break;
    }
  }
}

interface TraceEnvelopeSidecarParams {
readonly result: EvaluationResult;
readonly outputDir: string;
Expand Down Expand Up @@ -1748,6 +1778,7 @@ async function writeRawTranscriptJsonl(
const rawSource = rawProviderLogSourcePath(result);
if (rawSource) {
await copyFile(rawSource, filePath);
await cleanupProviderStagingFile(rawSource).catch(() => undefined);
return;
}
await writeGeneratedRawTranscriptJsonl(filePath, result, envelope);
Expand Down
20 changes: 16 additions & 4 deletions packages/core/test/evaluation/orchestrator.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
import { afterEach, describe, expect, it, mock } from 'bun:test';
import { mkdtempSync, readFileSync, readdirSync, writeFileSync } from 'node:fs';
import {
existsSync,
mkdirSync,
mkdtempSync,
readFileSync,
readdirSync,
writeFileSync,
} from 'node:fs';
import { tmpdir } from 'node:os';
import path from 'node:path';

Expand Down Expand Up @@ -723,9 +730,13 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`,
expect(result.failureReasonCode).toBe('provider_error');
});

it('stores raw provider logs once as transcript-raw evidence', async () => {
const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-raw-provider-log-'));
const rawLogPath = path.join(tempDir, 'provider-native-session.jsonl');
it('stores raw provider logs once as transcript-raw evidence and cleans staging files', async () => {
const stagingRoot = path.join(tmpdir(), 'agentv-provider-streams');
mkdirSync(stagingRoot, { recursive: true });
const tempDir = mkdtempSync(path.join(stagingRoot, 'raw-provider-log-'));
const rawLogDir = path.join(tempDir, 'suite', 'case-1', 'logs', 'codex');
mkdirSync(rawLogDir, { recursive: true });
const rawLogPath = path.join(rawLogDir, 'provider-native-session.jsonl');
writeFileSync(rawLogPath, '{"event":"provider-native"}\n', 'utf8');

const provider = new SequenceProvider('mock', {
Expand Down Expand Up @@ -770,6 +781,7 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`,
expect(indexRows[0]?.trace_path).toBeUndefined();
expect(indexRows[0]?.transcript_path).toBe(`${resultDir}/run-1/transcript.jsonl`);
expect(indexRows[0]?.transcript_raw_path).toBe(`${resultDir}/run-1/transcript-raw.jsonl`);
expect(existsSync(rawLogPath)).toBe(false);
});

it('reports failed progress status for batch item errors', async () => {
Expand Down