From 2007054ef7c2ebd3b226529261ada222f5bbc56c Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sat, 27 Jun 2026 11:38:59 +0200 Subject: [PATCH] feat(dashboard): identify results by eval path --- apps/cli/src/commands/results/serve.ts | 1 + apps/cli/test/commands/results/serve.test.ts | 16 ++++- apps/dashboard/src/components/EvalDetail.tsx | 3 +- .../src/components/EvalSourceLabel.tsx | 18 +++++ apps/dashboard/src/components/ResultTable.tsx | 29 +++----- apps/dashboard/src/components/Sidebar.tsx | 68 ++++++++++++++----- apps/dashboard/src/lib/navigation.test.ts | 44 ++++++++++++ apps/dashboard/src/lib/navigation.ts | 52 ++++++++++++++ apps/dashboard/src/lib/result-table.test.ts | 52 ++++++++++++-- apps/dashboard/src/lib/result-table.ts | 33 +++++++-- .../src/lib/run-detail-context.test.ts | 45 ++++++++++++ apps/dashboard/src/lib/run-detail-context.ts | 24 ++++++- apps/dashboard/src/lib/types.ts | 1 + .../src/routes/evals/$runId.$evalId.tsx | 11 ++- .../$projectId_/evals/$runId.$evalId.tsx | 11 ++- 15 files changed, 348 insertions(+), 60 deletions(-) create mode 100644 apps/dashboard/src/components/EvalSourceLabel.tsx diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 445421c4f..014427332 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -1203,6 +1203,7 @@ function attachRunDetailReadModelFields>( return { ...result, ...(record.aggregation && { aggregation: record.aggregation }), + ...(record.eval_path && { eval_path: record.eval_path }), ...(record.result_dir && { result_dir: record.result_dir }), ...(record.summary_path && { summary_path: record.summary_path }), ...(record.grading_path && { grading_path: record.grading_path }), diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 95fd1ab48..6d0e5d730 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -2581,18 +2581,30 @@ describe('serve app', () => { const filename = '2026-03-25T10-00-00-000Z'; const runDir = path.join(runsDir, filename); mkdirSync(runDir, { recursive: true }); - writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(RESULT_A, RESULT_B)); + writeFileSync( + path.join(runDir, 'index.jsonl'), + toJsonl( + { + ...RESULT_A, + eval_path: 'evals/demo.eval.yaml', + result_dir: 'demo/test-greeting', + }, + RESULT_B, + ), + ); const app = createApp([], tempDir, tempDir, undefined, { studioDir }); const res = await app.request(`/api/runs/${filename}`); expect(res.status).toBe(200); const data = (await res.json()) as { - results: { testId: string }[]; + results: { testId: string; eval_path?: string; result_dir?: string }[]; source: 'local' | 'remote'; source_label: string; }; expect(data.results).toHaveLength(2); expect(data.results[0].testId).toBe('test-greeting'); + expect(data.results[0].eval_path).toBe('evals/demo.eval.yaml'); + expect(data.results[0].result_dir).toBe('demo/test-greeting'); expect(data.source).toBe('local'); expect(data.source_label).toBe(filename); }); diff --git a/apps/dashboard/src/components/EvalDetail.tsx b/apps/dashboard/src/components/EvalDetail.tsx index 7fc805106..5447cd996 100644 --- a/apps/dashboard/src/components/EvalDetail.tsx +++ b/apps/dashboard/src/components/EvalDetail.tsx @@ -335,8 +335,9 @@ function SourceTab({ result }: { result: EvalResult }) {

Traceability

+ - +
diff --git a/apps/dashboard/src/components/EvalSourceLabel.tsx b/apps/dashboard/src/components/EvalSourceLabel.tsx new file mode 100644 index 000000000..c3fb01641 --- /dev/null +++ b/apps/dashboard/src/components/EvalSourceLabel.tsx @@ -0,0 +1,18 @@ +import { formatEvalSourceDisplay } from '~/lib/run-detail-context'; +import type { EvalResult } from '~/lib/types'; + +interface EvalSourceLabelProps { + result: Pick; + className?: string; +} + +export function EvalSourceLabel({ result, className = '' }: EvalSourceLabelProps) { + const display = formatEvalSourceDisplay(result); + if (!display) return null; + + return ( + + {display.label} + + ); +} diff --git a/apps/dashboard/src/components/ResultTable.tsx b/apps/dashboard/src/components/ResultTable.tsx index 457f1da99..25c8c1cf7 100644 --- a/apps/dashboard/src/components/ResultTable.tsx +++ b/apps/dashboard/src/components/ResultTable.tsx @@ -10,6 +10,7 @@ import type React from 'react'; import { Fragment, useEffect, useMemo, useState } from 'react'; import { useFeedback } from '~/lib/api'; +import { evalResultPath } from '~/lib/navigation'; import { RESULT_TABLE_VIEW_PRESETS, type RepeatRunGroup, @@ -651,8 +652,8 @@ function TrialResultCell({ return ; case 'score': return ; - case 'suite': - return ; + case 'eval': + return ; case 'category': return ; case 'duration': @@ -833,8 +834,8 @@ function ResultCell({ ) : ( ); - case 'suite': - return ; + case 'eval': + return ; case 'category': return ; case 'duration': @@ -910,11 +911,10 @@ function ResultDetailPanel({ onOpenTrialDetail: (trial: EvalCaseTrial, initialTab?: DetailTab) => void; onClose: () => void; }) { - const evalDetailHref = buildEvalDetailHref({ + const evalDetailHref = evalResultPath(runId, row.testId, { projectId, - runId, - evalId: row.testId, resultDir: row.result.result_dir, + evalPath: row.result.eval_path, }); const title = selectedTrialPath ? `${row.testId} · ${selectedTrialPath}` : row.testId; const showAggregateRepeatDetail = repeatGroup && !selectedTrial; @@ -934,7 +934,7 @@ function ResultDetailPanel({

{row.targetLabel} - {row.suiteLabel ? ` · ${row.suiteLabel}` : ''} + {row.evalLabel ? ` · ${row.evalLabel}` : ''}

@@ -1003,19 +1003,6 @@ function ExpanderCell({ ); } -function buildEvalDetailHref(options: { - projectId?: string; - runId: string; - evalId: string; - resultDir?: string; -}): string { - const base = options.projectId - ? `/projects/${encodeURIComponent(options.projectId)}/evals/${encodeURIComponent(options.runId)}/${encodeURIComponent(options.evalId)}` - : `/evals/${encodeURIComponent(options.runId)}/${encodeURIComponent(options.evalId)}`; - if (!options.resultDir) return base; - return `${base}?result_dir=${encodeURIComponent(options.resultDir)}`; -} - function scrollPanelIntoView(panel: HTMLElement | null) { if (!panel) return; window.requestAnimationFrame(() => { diff --git a/apps/dashboard/src/components/Sidebar.tsx b/apps/dashboard/src/components/Sidebar.tsx index 5a3aad362..543e442dc 100644 --- a/apps/dashboard/src/components/Sidebar.tsx +++ b/apps/dashboard/src/components/Sidebar.tsx @@ -33,13 +33,18 @@ import { useRunList, useStudioConfig, } from '~/lib/api'; -import { shouldShowSuiteLabels } from '~/lib/run-detail-context'; +import { + evalResultIdentityKey, + evalResultSearchParams, + matchesEvalResultIdentity, +} from '~/lib/navigation'; +import { shouldShowEvalSourceLabels } from '~/lib/run-detail-context'; import { formatRunDisplay } from '~/lib/run-label'; import { useSidebarContext } from '~/lib/sidebar-context'; import type { EvalResult } from '~/lib/types'; import { BrandName } from './BrandName'; -import { EvalSuiteLabel } from './EvalSuiteLabel'; +import { EvalSourceLabel } from './EvalSourceLabel'; /** Responsive
{data?.results.map((result) => { - const isActive = result.testId === currentEvalId; + const search = evalResultSearchParams({ + resultDir: result.result_dir, + evalPath: result.eval_path, + }); + const isActive = matchesEvalResultIdentity(result, currentEvalId, currentIdentity); return ( ); @@ -490,12 +510,17 @@ function SuiteSidebar({ runId, suite }: { runId: string; suite: string }) { {suiteResults.map((result) => { const passed = isPassing(result.score, passThreshold); + const search = evalResultSearchParams({ + resultDir: result.result_dir, + evalPath: result.eval_path, + }); return ( @@ -609,8 +634,9 @@ function ProjectEvalSidebar({ }) { const { data } = useProjectRunDetail(projectId, runId); const { data: config } = useStudioConfig(projectId); + const currentIdentity = useCurrentEvalIdentitySearch(); const passThreshold = config?.threshold ?? config?.pass_threshold ?? 0.8; - const showSuiteLabels = shouldShowSuiteLabels(data?.results ?? []); + const showEvalSourceLabels = shouldShowEvalSourceLabels(data?.results ?? []); return ( @@ -632,12 +658,17 @@ function ProjectEvalSidebar({ Evaluations {data?.results.map((result) => { - const isActive = result.testId === currentEvalId; + const search = evalResultSearchParams({ + resultDir: result.result_dir, + evalPath: result.eval_path, + }); + const isActive = matchesEvalResultIdentity(result, currentEvalId, currentIdentity); return ( ); @@ -693,11 +724,16 @@ function ProjectSuiteSidebar({ {suiteResults.map((result) => { const passed = isPassing(result.score, passThreshold); + const search = evalResultSearchParams({ + resultDir: result.result_dir, + evalPath: result.eval_path, + }); return ( diff --git a/apps/dashboard/src/lib/navigation.test.ts b/apps/dashboard/src/lib/navigation.test.ts index d47c56388..73dbdf4d5 100644 --- a/apps/dashboard/src/lib/navigation.test.ts +++ b/apps/dashboard/src/lib/navigation.test.ts @@ -3,9 +3,13 @@ import { describe, expect, it } from 'bun:test'; import { categoryPath, evalPath, + evalResultIdentityKey, + evalResultPath, + evalResultSearchParams, experimentPath, initialProjectRedirectStorageKey, jobPath, + matchesEvalResultIdentity, resolveIndexRoute, resolveInitialProjectRedirect, runPath, @@ -58,6 +62,15 @@ describe('route path helpers', () => { expect(evalPath('run::1', 'case/a', 'demo project')).toBe( '/projects/demo%20project/evals/run%3A%3A1/case%2Fa', ); + expect( + evalResultPath('run::1', 'case/a', { + projectId: 'demo project', + resultDir: 'evals/auth.eval.yaml/case-a', + evalPath: 'evals/auth.eval.yaml', + }), + ).toBe( + '/projects/demo%20project/evals/run%3A%3A1/case%2Fa?result_dir=evals%2Fauth.eval.yaml%2Fcase-a', + ); expect(jobPath('job/1', 'demo project')).toBe('/projects/demo%20project/jobs/job%2F1'); expect(categoryPath('run::1', 'Safety > PII', 'demo project')).toBe( '/projects/demo%20project/runs/run%3A%3A1/category/Safety%20%3E%20PII', @@ -74,6 +87,9 @@ describe('route path helpers', () => { it('keeps unscoped paths for legacy single-project routes', () => { expect(runPath('run::1')).toBe('/runs/run%3A%3A1'); expect(evalPath('run::1', 'case/a')).toBe('/evals/run%3A%3A1/case%2Fa'); + expect(evalResultPath('run::1', 'case/a', { evalPath: 'evals/smoke.eval.yaml' })).toBe( + '/evals/run%3A%3A1/case%2Fa?eval_path=evals%2Fsmoke.eval.yaml', + ); expect(jobPath('job/1')).toBe('/jobs/job%2F1'); expect(categoryPath('run::1', 'Safety')).toBe('/runs/run%3A%3A1/category/Safety'); expect(suitePath('run::1', 'evals/smoke.eval.yaml')).toBe( @@ -81,4 +97,32 @@ describe('route path helpers', () => { ); expect(runsHomePath()).toBe('/?tab=runs'); }); + + it('prefers result_dir over eval_path for eval result query identity', () => { + expect( + evalResultSearchParams({ + resultDir: 'opaque/case', + evalPath: 'evals/smoke.eval.yaml', + }), + ).toEqual({ result_dir: 'opaque/case' }); + expect(evalResultSearchParams({ evalPath: 'evals/smoke.eval.yaml' })).toEqual({ + eval_path: 'evals/smoke.eval.yaml', + }); + }); + + it('matches and keys eval results by result_dir before eval_path', () => { + const result = { + testId: 'shared', + target: 'codex', + result_dir: 'opaque/shared', + eval_path: 'evals/auth.eval.yaml', + }; + + expect(evalResultIdentityKey(result)).toBe('opaque/shared'); + expect(matchesEvalResultIdentity(result, 'shared', { resultDir: 'opaque/shared' })).toBe(true); + expect(matchesEvalResultIdentity(result, 'shared', { resultDir: 'other/shared' })).toBe(false); + expect(matchesEvalResultIdentity(result, 'shared', { evalPath: 'evals/auth.eval.yaml' })).toBe( + true, + ); + }); }); diff --git a/apps/dashboard/src/lib/navigation.ts b/apps/dashboard/src/lib/navigation.ts index 6496dcdec..d8bb1488e 100644 --- a/apps/dashboard/src/lib/navigation.ts +++ b/apps/dashboard/src/lib/navigation.ts @@ -33,6 +33,58 @@ export function evalPath(runId: string, evalId: string, projectId?: string): str : `/evals/${encodeURIComponent(runId)}/${encodeURIComponent(evalId)}`; } +export interface EvalResultPathOptions { + projectId?: string; + resultDir?: string; + evalPath?: string; +} + +export interface EvalResultIdentity { + testId: string; + target?: string; + result_dir?: string; + eval_path?: string; + suite?: string; +} + +export function evalResultSearchParams(options: EvalResultPathOptions): Record { + if (options.resultDir) { + return { result_dir: options.resultDir }; + } + if (options.evalPath) { + return { eval_path: options.evalPath }; + } + return {}; +} + +export function evalResultPath( + runId: string, + evalId: string, + options: EvalResultPathOptions = {}, +): string { + const base = evalPath(runId, evalId, options.projectId); + const params = new URLSearchParams(evalResultSearchParams(options)); + const query = params.toString(); + return query ? `${base}?${query}` : base; +} + +export function evalResultIdentityKey(result: EvalResultIdentity): string { + if (result.result_dir) return result.result_dir; + return [result.eval_path ?? result.suite ?? '', result.testId, result.target ?? ''].join(':'); +} + +export function matchesEvalResultIdentity( + result: Pick, + evalId: string, + options: Pick = {}, +): boolean { + return ( + result.testId === evalId && + (!options.resultDir || result.result_dir === options.resultDir) && + (!options.evalPath || result.eval_path === options.evalPath) + ); +} + export function experimentPath(experimentName: string, projectId?: string): string { return projectId ? `/projects/${encodeURIComponent(projectId)}/experiments/${encodeURIComponent(experimentName)}` diff --git a/apps/dashboard/src/lib/result-table.test.ts b/apps/dashboard/src/lib/result-table.test.ts index 6c398fecc..9e50afb1f 100644 --- a/apps/dashboard/src/lib/result-table.test.ts +++ b/apps/dashboard/src/lib/result-table.test.ts @@ -90,7 +90,7 @@ describe('result-table model', () => { results: [ result({ testId: 'metric-case', - suite: 'dataset.eval.yaml', + eval_path: 'evals/dataset.eval.yaml', category: 'smoke', target: 'azure', durationMs: 1234, @@ -105,7 +105,7 @@ describe('result-table model', () => { 'status', 'test', 'target', - 'suite', + 'eval', 'score', 'category', 'duration', @@ -114,15 +114,17 @@ describe('result-table model', () => { 'grader:correctness', ]); expect(model.visibleColumns.map((column) => column.id)).toContain('grader:correctness'); + expect(model.columns.find((column) => column.id === 'eval')?.label).toBe('Eval'); + expect(model.rows[0].evalLabel).toBe('evals/dataset.eval.yaml'); }); - it('orders repeat-run columns with target before suite before score', () => { + it('orders repeat-run columns with target before eval before score', () => { const model = buildResultTableModel({ passThreshold: 0.8, results: [ result({ testId: 'repeat-case', - suite: 'strict-layout', + eval_path: 'evals/strict-layout.eval.yaml', target: 'openai', trials: [ { attempt: 0, run_path: 'run-1', score: 1, verdict: 'pass' }, @@ -137,7 +139,7 @@ describe('result-table model', () => { 'expander', 'test', 'target', - 'suite', + 'eval', 'score', ]); expect(model.repeatGroups).toHaveLength(1); @@ -168,4 +170,44 @@ describe('result-table model', () => { expect(model.state.grader).toBe('rubric'); expect(model.visibleColumns.map((column) => column.id)).toEqual(['grader:rubric']); }); + + it('uses eval_path and result_dir to distinguish duplicate test IDs', () => { + const model = buildResultTableModel({ + passThreshold: 0.8, + results: [ + result({ + testId: 'shared-case', + eval_path: 'evals/auth/login.eval.yaml', + result_dir: 'auth-login/shared-case', + target: 'codex', + }), + result({ + testId: 'shared-case', + eval_path: 'evals/billing/login.eval.yaml', + result_dir: 'billing-login/shared-case', + target: 'codex', + }), + ], + }); + + expect(model.rows.map((row) => row.evalLabel)).toEqual([ + 'evals/auth/login.eval.yaml', + 'evals/billing/login.eval.yaml', + ]); + expect(model.rows.map((row) => row.key)).toEqual([ + 'result_dir:auth-login/shared-case', + 'result_dir:billing-login/shared-case', + ]); + expect(new Set(model.rows.map((row) => row.key)).size).toBe(2); + }); + + it('falls back to legacy suite labels for old runs without eval_path', () => { + const model = buildResultTableModel({ + passThreshold: 0.8, + results: [result({ testId: 'legacy-case', suite: 'legacy-suite' })], + }); + + expect(model.columns.find((column) => column.id === 'eval')?.label).toBe('Eval'); + expect(model.rows[0].evalLabel).toBe('legacy-suite'); + }); }); diff --git a/apps/dashboard/src/lib/result-table.ts b/apps/dashboard/src/lib/result-table.ts index 6587b04f5..ee84e1bde 100644 --- a/apps/dashboard/src/lib/result-table.ts +++ b/apps/dashboard/src/lib/result-table.ts @@ -68,7 +68,7 @@ export interface ResultTableRow { readonly reviewed: boolean; readonly targetLabel: string; readonly modelLabel?: string; - readonly suiteLabel?: string; + readonly evalLabel?: string; readonly categoryLabel?: string; readonly tokenTotal?: number; readonly graderNames: readonly string[]; @@ -235,6 +235,25 @@ function targetLabel(result: EvalResult): string { return targetUsed && targetUsed !== target ? `${target} -> ${targetUsed}` : target; } +function evalLabel(result: EvalResult): string | undefined { + return cleanString(result.eval_path) ?? cleanString(result.suite); +} + +function rowKey(result: EvalResult, index: number): string { + const resultDir = cleanString(result.result_dir); + if (resultDir) return `result_dir:${resultDir}`; + + return [ + 'result', + cleanString(result.eval_path) ?? cleanString(result.suite) ?? '', + result.testId, + cleanString(result.target) ?? '', + cleanString(result.targetUsed) ?? '', + cleanString(result.timestamp) ?? '', + String(index), + ].join(':'); +} + function buildRow( result: EvalResult, index: number, @@ -250,6 +269,7 @@ function buildRow( result.scores?.some((score) => scoreHasFailure(score, passThreshold)) ?? false; const model = modelLabel(result); const target = targetLabel(result); + const evalSource = evalLabel(result); const suite = cleanString(result.suite); const category = cleanString(result.category); const tokenTotal = totalTokens(result); @@ -257,6 +277,7 @@ function buildRow( result.testId, target, model ?? '', + evalSource ?? '', suite ?? '', category ?? '', result.executionStatus ?? '', @@ -266,7 +287,7 @@ function buildRow( ]; return { - key: `${result.testId}:${result.target ?? ''}:${result.timestamp ?? ''}:${index}`, + key: rowKey(result, index), result, index, testId: result.testId, @@ -278,7 +299,7 @@ function buildRow( reviewed: reviewedTestIds.has(result.testId), targetLabel: target, ...(model && { modelLabel: model }), - ...(suite && { suiteLabel: suite }), + ...(evalSource && { evalLabel: evalSource }), ...(category && { categoryLabel: category }), ...(tokenTotal !== undefined && { tokenTotal }), graderNames, @@ -329,7 +350,7 @@ function buildRepeatGroup(row: ResultTableRow, passThreshold: number): RepeatRun function buildColumns(rows: readonly ResultTableRow[], graderOptions: readonly string[]) { const hasRepeatRows = rows.some((row) => caseTrials(row.result).length > 1); - const hasSuite = rows.some((row) => row.suiteLabel); + const hasEval = rows.some((row) => row.evalLabel); const hasCategory = rows.some((row) => row.categoryLabel); const hasDuration = rows.some( (row) => @@ -354,8 +375,8 @@ function buildColumns(rows: readonly ResultTableRow[], graderOptions: readonly s : []), { id: 'test', label: 'Test ID', kind: 'base', defaultVisible: true }, { id: 'target', label: 'Target', kind: 'base', defaultVisible: true }, - ...(hasSuite - ? [{ id: 'suite', label: 'Suite', kind: 'base' as const, defaultVisible: true }] + ...(hasEval + ? [{ id: 'eval', label: 'Eval', kind: 'base' as const, defaultVisible: true }] : []), { id: 'score', label: 'Score', kind: 'base', defaultVisible: true }, ...(hasCategory diff --git a/apps/dashboard/src/lib/run-detail-context.test.ts b/apps/dashboard/src/lib/run-detail-context.test.ts index cc54b7261..aae745b23 100644 --- a/apps/dashboard/src/lib/run-detail-context.test.ts +++ b/apps/dashboard/src/lib/run-detail-context.test.ts @@ -4,8 +4,11 @@ import type { EvalResult } from './types'; import { buildRunDetailHeader, + evalSourceValue, formatCategoryDisplay, + formatEvalSourceDisplay, formatSuiteDisplay, + shouldShowEvalSourceLabels, shouldShowSuiteLabels, } from './run-detail-context'; @@ -97,6 +100,28 @@ describe('formatSuiteDisplay', () => { }); }); +describe('eval source labels', () => { + it('prefers eval_path over legacy suite metadata', () => { + const result = { + eval_path: 'evals/auth/login.eval.yaml', + suite: 'legacy-suite', + }; + + expect(evalSourceValue(result)).toBe('evals/auth/login.eval.yaml'); + expect(formatEvalSourceDisplay(result)).toEqual({ + label: 'login', + title: 'evals/auth/login.eval.yaml', + }); + }); + + it('falls back to suite for old result rows', () => { + expect(formatEvalSourceDisplay({ suite: 'legacy-suite' })).toEqual({ + label: 'legacy-suite', + title: 'legacy-suite', + }); + }); +}); + describe('shouldShowSuiteLabels', () => { it('shows labels for mixed-suite runs', () => { expect( @@ -110,3 +135,23 @@ describe('shouldShowSuiteLabels', () => { ).toBe(false); }); }); + +describe('shouldShowEvalSourceLabels', () => { + it('shows labels for mixed eval paths even when test IDs overlap', () => { + expect( + shouldShowEvalSourceLabels([ + { eval_path: 'evals/a.eval.yaml', suite: 'legacy' }, + { eval_path: 'evals/b.eval.yaml', suite: 'legacy' }, + ]), + ).toBe(true); + }); + + it('suppresses repeated labels for a single eval path', () => { + expect( + shouldShowEvalSourceLabels([ + { eval_path: 'evals/a.eval.yaml' }, + { eval_path: 'evals/a.eval.yaml' }, + ]), + ).toBe(false); + }); +}); diff --git a/apps/dashboard/src/lib/run-detail-context.ts b/apps/dashboard/src/lib/run-detail-context.ts index e88764a75..378c07cc6 100644 --- a/apps/dashboard/src/lib/run-detail-context.ts +++ b/apps/dashboard/src/lib/run-detail-context.ts @@ -6,9 +6,9 @@ * presentation logic here so route components stay thin and tests can pin * the remote-context contract without rendering React. * - * Suite labels are displayed only when a run mixes suites or has partial suite - * metadata. Keep the table/sidebar dense by suppressing repeated labels for - * single-suite runs. + * Eval source labels are displayed only when a run mixes eval files or has + * partial legacy suite metadata. Keep the table/sidebar dense by suppressing + * repeated labels for single-eval runs. */ import type { EvalResult, RunDetailResponse } from './types'; @@ -17,6 +17,7 @@ type RunSource = RunDetailResponse['source']; type HeaderResult = Pick; type SuiteLabelResult = Pick; +type EvalSourceLabelResult = Pick; export interface RunDetailHeaderInput { runId: string; @@ -162,9 +163,26 @@ export function formatSuiteDisplay(suite: string | undefined): SuiteDisplay | un }; } +export function evalSourceValue(result: EvalSourceLabelResult): string | undefined { + return cleanOptional(result.eval_path) ?? cleanOptional(result.suite); +} + +export function formatEvalSourceDisplay(result: EvalSourceLabelResult): SuiteDisplay | undefined { + return formatSuiteDisplay(evalSourceValue(result)); +} + export function shouldShowSuiteLabels(results: readonly SuiteLabelResult[]): boolean { const normalizedSuites = results.map((result) => cleanOptional(result.suite) ?? ''); const meaningfulSuites = normalizedSuites.filter((suite) => suite && suite !== 'Uncategorized'); return meaningfulSuites.length > 0 && new Set(normalizedSuites).size > 1; } + +export function shouldShowEvalSourceLabels(results: readonly EvalSourceLabelResult[]): boolean { + const normalizedSources = results.map((result) => evalSourceValue(result) ?? ''); + const meaningfulSources = normalizedSources.filter( + (source) => source && source !== 'Uncategorized', + ); + + return meaningfulSources.length > 0 && new Set(normalizedSources).size > 1; +} diff --git a/apps/dashboard/src/lib/types.ts b/apps/dashboard/src/lib/types.ts index 58ba2a89d..835504ce3 100644 --- a/apps/dashboard/src/lib/types.ts +++ b/apps/dashboard/src/lib/types.ts @@ -226,6 +226,7 @@ export type TraceSessionResponse = CoreTraceSessionResponse; export interface EvalResult { testId: string; timestamp?: string; + eval_path?: string; suite?: string; category?: string; target?: string; diff --git a/apps/dashboard/src/routes/evals/$runId.$evalId.tsx b/apps/dashboard/src/routes/evals/$runId.$evalId.tsx index 18462ddcd..54695dceb 100644 --- a/apps/dashboard/src/routes/evals/$runId.$evalId.tsx +++ b/apps/dashboard/src/routes/evals/$runId.$evalId.tsx @@ -12,6 +12,7 @@ import { useState } from 'react'; import { EvalDetail } from '~/components/EvalDetail'; import { RunEvalModal } from '~/components/RunEvalModal'; import { isPassing, useRunDetail, useStudioConfig } from '~/lib/api'; +import { matchesEvalResultIdentity } from '~/lib/navigation'; export const Route = createFileRoute('/evals/$runId/$evalId')({ component: EvalDetailPage, @@ -23,6 +24,10 @@ function EvalDetailPage() { typeof window === 'undefined' ? undefined : (new URLSearchParams(window.location.search).get('result_dir') ?? undefined); + const evalPath = + typeof window === 'undefined' + ? undefined + : (new URLSearchParams(window.location.search).get('eval_path') ?? undefined); const { data, isLoading, error } = useRunDetail(runId); const { data: config } = useStudioConfig(); const [showRunEval, setShowRunEval] = useState(false); @@ -45,8 +50,8 @@ function EvalDetailPage() { ); } - const result = data?.results.find( - (r) => r.testId === evalId && (!resultDir || r.result_dir === resultDir), + const result = data?.results.find((r) => + matchesEvalResultIdentity(r, evalId, { resultDir, evalPath }), ); if (!result) { @@ -71,7 +76,7 @@ function EvalDetailPage() {

- Run: {runId} / Eval: {evalId} + Run: {runId} / Eval: {result.eval_path ?? evalId}

r.testId === evalId && (!resultDir || r.result_dir === resultDir), + const result = data?.results.find((r) => + matchesEvalResultIdentity(r, evalId, { resultDir, evalPath }), ); if (!result) { @@ -67,7 +72,7 @@ function ProjectEvalDetailPage() {

- Run: {runId} / Eval: {evalId} + Run: {runId} / Eval: {result.eval_path ?? evalId}