diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 32c64335b..9f609e831 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -63,6 +63,7 @@ import { getProject, loadConfig, loadProjectRegistry, + normalizeCategoryPath, normalizeTraceArtifactToTraceSessionResponse, omitExternalTraceMetadataKeys, readGitResultArtifact, @@ -1883,30 +1884,7 @@ async function handleRunCategories(c: C, { searchDir, agentvDir, projectId }: Da try { const loaded = await loadManifestResultsForMeta(searchDir, meta, projectId); const { threshold: pass_threshold } = loadStudioConfig(agentvDir); - const categoryMap = new Map }>(); - for (const r of loaded) { - const cat = r.category ?? DEFAULT_CATEGORY; - const entry = categoryMap.get(cat) ?? { - results: [], - suites: new Set(), - }; - entry.results.push(r); - entry.suites.add(r.suite ?? r.target ?? 'default'); - categoryMap.set(cat, entry); - } - const categories = [...categoryMap.entries()].map(([name, entry]) => { - const qualitySummary = summarizeQualityResults(entry.results, pass_threshold); - return { - name, - total: qualitySummary.totalCount, - passed: qualitySummary.passedCount, - failed: qualitySummary.qualityFailureCount, - avg_score: qualitySummary.avgScore, - execution_error_count: qualitySummary.executionErrorCount, - suite_count: entry.suites.size, - }; - }); - return c.json({ categories }); + return c.json(buildCategoryRollups(loaded, pass_threshold)); } catch { return c.json({ error: 'Failed to load categories' }, 500); } @@ -1920,7 +1898,10 @@ async function handleCategorySuites(c: C, { searchDir, agentvDir, projectId }: D try { const loaded = await loadManifestResultsForMeta(searchDir, meta, projectId); const { threshold: pass_threshold } = loadStudioConfig(agentvDir); - const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category); + const selectedCategory = normalizeCategoryPath(category); + const filtered = loaded.filter((r) => + isCategoryDescendant(categoryPathFromResult(r), selectedCategory), + ); const suiteMap = new Map(); for (const r of filtered) { const ds = r.suite ?? r.target ?? 'default'; @@ -1945,6 +1926,120 @@ async function handleCategorySuites(c: C, { searchDir, agentvDir, projectId }: D } } +interface CategoryRollupBucket { + readonly results: EvaluationResult[]; + readonly suites: Set; + readonly children: Set; +} + +interface CategoryRollupSummary { + readonly name: string; + readonly label: string; + readonly parent?: string; + readonly depth: number; + readonly total: number; + readonly passed: number; + readonly failed: number; + readonly avg_score: number; + readonly execution_error_count: number; + readonly suite_count: number; + readonly child_count: number; + readonly children?: CategoryRollupSummary[]; +} + +function categoryPathFromResult(result: EvaluationResult): string { + return normalizeCategoryPath(result.category ?? DEFAULT_CATEGORY); +} + +function categoryPrefixes(category: string): string[] { + const parts = category.split('/').filter((part) => part.length > 0); + if (parts.length === 0) return [DEFAULT_CATEGORY]; + return parts.map((_, index) => parts.slice(0, index + 1).join('/')); +} + +function categoryParent(category: string): string | undefined { + const parts = category.split('/'); + return parts.length > 1 ? parts.slice(0, -1).join('/') : undefined; +} + +function categoryLabel(category: string): string { + return category.split('/').at(-1) ?? category; +} + +function isCategoryDescendant(category: string, selectedCategory: string): boolean { + return category === selectedCategory || category.startsWith(`${selectedCategory}/`); +} + +function summarizeCategoryBucket( + name: string, + entry: CategoryRollupBucket, + passThreshold: number, +): CategoryRollupSummary { + const qualitySummary = summarizeQualityResults(entry.results, passThreshold); + const parent = categoryParent(name); + return { + name, + label: categoryLabel(name), + ...(parent && { parent }), + depth: name.split('/').filter(Boolean).length - 1, + total: qualitySummary.totalCount, + passed: qualitySummary.passedCount, + failed: qualitySummary.qualityFailureCount, + avg_score: qualitySummary.avgScore, + execution_error_count: qualitySummary.executionErrorCount, + suite_count: entry.suites.size, + child_count: entry.children.size, + }; +} + +function buildCategoryRollups( + results: readonly EvaluationResult[], + passThreshold: number, +): { categories: CategoryRollupSummary[]; category_tree: CategoryRollupSummary[] } { + const categoryMap = new Map(); + const ensureEntry = (name: string): CategoryRollupBucket => { + const existing = categoryMap.get(name); + if (existing) return existing; + const created = { results: [], suites: new Set(), children: new Set() }; + categoryMap.set(name, created); + return created; + }; + + for (const result of results) { + const category = categoryPathFromResult(result); + const suite = result.suite ?? result.target ?? 'default'; + const prefixes = categoryPrefixes(category); + for (const prefix of prefixes) { + const entry = ensureEntry(prefix); + entry.results.push(result); + entry.suites.add(suite); + } + for (let index = 1; index < prefixes.length; index++) { + ensureEntry(prefixes[index - 1]).children.add(prefixes[index]); + } + } + + const categories = [...categoryMap.entries()] + .map(([name, entry]) => summarizeCategoryBucket(name, entry, passThreshold)) + .sort((a, b) => a.name.localeCompare(b.name)); + + const summariesByName = new Map(categories.map((summary) => [summary.name, summary])); + const buildTreeNode = (summary: CategoryRollupSummary): CategoryRollupSummary => { + const children = [...(categoryMap.get(summary.name)?.children ?? [])] + .map((childName) => summariesByName.get(childName)) + .filter((child): child is CategoryRollupSummary => Boolean(child)) + .sort((a, b) => a.name.localeCompare(b.name)) + .map(buildTreeNode); + return children.length > 0 ? { ...summary, children } : summary; + }; + const categoryTree = categories + .filter((summary) => !summary.parent) + .sort((a, b) => a.name.localeCompare(b.name)) + .map(buildTreeNode); + + return { categories, category_tree: categoryTree }; +} + async function handleEvalDetail(c: C, { searchDir, projectId }: DataContext) { const filename = c.req.param('filename') ?? ''; const evalId = c.req.param('evalId') ?? ''; @@ -2449,7 +2544,7 @@ async function handleCompare(c: C, { searchDir, agentvDir, projectId }: DataCont } entry.tests.push({ test_id: r.testId, - ...(r.category && { category: r.category }), + ...(r.category && { category: normalizeCategoryPath(r.category) }), score: r.score, passed, execution_status: r.executionStatus, @@ -2459,7 +2554,7 @@ async function handleCompare(c: C, { searchDir, agentvDir, projectId }: DataCont // Per-run accumulation. Dedupe tests within the run by last-wins. runTestMap.set(r.testId, { test_id: r.testId, - ...(r.category && { category: r.category }), + ...(r.category && { category: normalizeCategoryPath(r.category) }), score: r.score, passed, execution_status: r.executionStatus, diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index fab72ba75..3e0295fcd 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -1457,17 +1457,113 @@ describe('serve app', () => { suite_count: number; }>; }; - expect(categoriesData.categories).toEqual([ - { - name: 'runtime', - total: 3, - passed: 1, - failed: 1, - avg_score: 0.75, - execution_error_count: 1, - suite_count: 1, - }, - ]); + expect(categoriesData.categories).toHaveLength(1); + expect(categoriesData.categories[0]).toMatchObject({ + name: 'runtime', + total: 3, + passed: 1, + failed: 1, + avg_score: 0.75, + execution_error_count: 1, + suite_count: 1, + }); + }); + + it('returns hierarchical category rollups and descendant category drilldown', async () => { + const runsDir = localResultsExperimentDir(tempDir); + mkdirSync(runsDir, { recursive: true }); + const filename = '2026-03-25T10-30-00-000Z'; + const runDir = path.join(runsDir, filename); + mkdirSync(runDir, { recursive: true }); + writeFileSync( + path.join(runDir, 'index.jsonl'), + toJsonl( + { + ...RESULT_A, + test_id: 'network-pass', + suite: 'network-suite', + category: 'security/network', + score: 1, + }, + { + ...RESULT_B, + test_id: 'security-fail', + suite: 'root-suite', + category: 'security', + score: 0, + }, + { + ...RESULT_A, + test_id: 'flat-pass', + suite: 'legacy-suite', + category: 'legacy-flat', + score: 1, + }, + ), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + + const categoriesRes = await app.request(`/api/runs/${filename}/categories`); + expect(categoriesRes.status).toBe(200); + const categoriesData = (await categoriesRes.json()) as { + categories: Array<{ + name: string; + parent?: string; + total: number; + passed: number; + failed: number; + child_count?: number; + }>; + category_tree?: Array<{ name: string; children?: Array<{ name: string }> }>; + }; + + expect(categoriesData.categories).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + name: 'security', + total: 2, + passed: 1, + failed: 1, + child_count: 1, + }), + expect.objectContaining({ + name: 'security/network', + parent: 'security', + total: 1, + passed: 1, + failed: 0, + }), + expect.objectContaining({ + name: 'legacy-flat', + total: 1, + passed: 1, + failed: 0, + }), + ]), + ); + expect(categoriesData.category_tree).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + name: 'security', + children: [expect.objectContaining({ name: 'security/network' })], + }), + ]), + ); + + const suitesRes = await app.request( + `/api/runs/${filename}/categories/${encodeURIComponent('security')}/suites`, + ); + expect(suitesRes.status).toBe(200); + const suitesData = (await suitesRes.json()) as { + suites: Array<{ name: string; total: number }>; + }; + expect(suitesData.suites).toEqual( + expect.arrayContaining([ + expect.objectContaining({ name: 'network-suite', total: 1 }), + expect.objectContaining({ name: 'root-suite', total: 1 }), + ]), + ); }); it('infers the experiment name from the run id when live results have not written it yet', async () => { diff --git a/apps/dashboard/src/components/RunDetail.tsx b/apps/dashboard/src/components/RunDetail.tsx index 1f20c2950..944f7a8cb 100644 --- a/apps/dashboard/src/components/RunDetail.tsx +++ b/apps/dashboard/src/components/RunDetail.tsx @@ -22,9 +22,9 @@ import { Link } from '@tanstack/react-router'; import type { EvalResult } from '~/lib/types'; import { useRunLog, useStudioConfig } from '~/lib/api'; +import { type CategoryTreeNode, buildCategoryTree } from '~/lib/category-tree'; import { findPhoenixExternalTraceUrl } from '~/lib/external-trace-link'; import { summarizeQuality } from '~/lib/result-summary'; -import { formatCategoryDisplay } from '~/lib/run-detail-context'; import { PassRatePill } from './PassRatePill'; import { ResultTable } from './ResultTable'; @@ -36,91 +36,21 @@ interface RunDetailProps { projectId?: string; } -interface SuiteStats { - name: string; - passed: number; - failed: number; - executionErrors: number; - total: number; - avgScore: number; -} - -interface CategoryGroup { - name: string; - displayName: string; - mutedDisplayName?: string; - suites: SuiteStats[]; - total: number; - passed: number; - failed: number; - executionErrors: number; - avgScore: number; -} - -function buildCategoryGroups(results: EvalResult[], passThreshold: number): CategoryGroup[] { - const categoryMap = new Map>(); - - for (const r of results) { - const cat = r.category ?? 'Uncategorized'; - const ds = r.suite ?? 'Uncategorized'; - if (!categoryMap.has(cat)) categoryMap.set(cat, new Map()); - // biome-ignore lint/style/noNonNullAssertion: map entry guaranteed by line above - const dsMap = categoryMap.get(cat)!; - const entry = dsMap.get(ds) ?? []; - entry.push(r); - dsMap.set(ds, entry); - } - - return Array.from(categoryMap.entries()) - .map(([catName, dsMap]) => { - const suites = Array.from(dsMap.entries()) - .map(([dsName, suiteResults]) => { - const stats = summarizeQuality(suiteResults, passThreshold); - return { - name: dsName, - passed: stats.passed, - failed: stats.failed, - executionErrors: stats.executionErrors, - total: stats.total, - avgScore: stats.avgScore, - }; - }) - .sort((a, b) => a.name.localeCompare(b.name)); - - const total = suites.reduce((s, d) => s + d.total, 0); - const passed = suites.reduce((s, d) => s + d.passed, 0); - const failed = suites.reduce((s, d) => s + d.failed, 0); - const executionErrors = suites.reduce((s, d) => s + d.executionErrors, 0); - const qualityTotal = total - executionErrors; - const scoreSum = suites.reduce((s, d) => s + d.avgScore * (d.total - d.executionErrors), 0); - - const display = formatCategoryDisplay(catName); - - return { - name: catName, - displayName: display.label, - mutedDisplayName: display.mutedLabel, - suites, - total, - passed, - failed, - executionErrors, - avgScore: qualityTotal > 0 ? scoreSum / qualityTotal : 0, - }; - }) - .sort((a, b) => a.name.localeCompare(b.name)); -} - export function RunDetail({ results, runId, projectId }: RunDetailProps) { const { data: config } = useStudioConfig(projectId); const passThreshold = config?.threshold ?? config?.pass_threshold ?? 0.8; + const [expandedCategories, setExpandedCategories] = useState>({}); const phoenixUrl = findPhoenixExternalTraceUrl(results); const total = results.length; const summary = summarizeQuality(results, passThreshold); const totalCost = results.reduce((sum, r) => sum + (r.costUsd ?? 0), 0); - const categories = buildCategoryGroups(results, passThreshold); + const categoryTree = buildCategoryTree(results, passThreshold); + const visibleCategories = visibleCategoryRows(categoryTree, expandedCategories); + const toggleCategory = (category: string) => { + setExpandedCategories((current) => ({ ...current, [category]: !current[category] })); + }; if (total === 0) { return ( @@ -166,43 +96,59 @@ export function RunDetail({ results, runId, projectId }: RunDetailProps) { - {categories.map((cat) => { - const label = ( - - {cat.displayName} - {cat.mutedDisplayName ? ( - - {cat.mutedDisplayName} - - ) : null} - - ); - + {visibleCategories.map((cat) => { + const expanded = expandedCategories[cat.name] === true; return ( - {projectId ? ( - - {label} - - ) : ( - - {label} - - )} + + + {cat.childCount > 0 ? ( + + ) : ( + + )} + {projectId ? ( + + {cat.label} + + ) : ( + + {cat.label} + + )} + {cat.depth > 0 ? ( + + {cat.name} + + ) : null} + {cat.childCount > 0 ? ( + + {cat.childCount} + + ) : null} + , +): CategoryTreeNode[] { + return nodes.flatMap((node) => [ + node, + ...(expanded[node.name] ? visibleCategoryRows(node.children, expanded) : []), + ]); +} + function ExternalTraceLink({ href }: { href?: string }) { if (!href) return null; diff --git a/apps/dashboard/src/lib/category-tree.test.ts b/apps/dashboard/src/lib/category-tree.test.ts new file mode 100644 index 000000000..bfd60eb07 --- /dev/null +++ b/apps/dashboard/src/lib/category-tree.test.ts @@ -0,0 +1,70 @@ +import { describe, expect, it } from 'bun:test'; + +import { buildCategoryTree, flattenCategoryTree, normalizeCategoryPath } from './category-tree'; +import type { EvalResult } from './types'; + +function result(overrides: Partial): EvalResult { + return { + testId: overrides.testId ?? 'case', + suite: overrides.suite ?? 'suite', + category: overrides.category, + score: overrides.score ?? 1, + ...overrides, + }; +} + +describe('category tree model', () => { + it('builds parent rollups from slash-delimited category metadata', () => { + const tree = buildCategoryTree( + [ + result({ testId: 'network-pass', category: 'security/network', score: 1 }), + result({ testId: 'security-fail', category: 'security', score: 0 }), + result({ testId: 'quality-pass', category: 'quality/regression', score: 0.9 }), + ], + 0.8, + ); + + const nodes = flattenCategoryTree(tree); + const security = nodes.find((node) => node.name === 'security'); + const network = nodes.find((node) => node.name === 'security/network'); + + expect(tree.map((node) => node.name)).toEqual(['quality', 'security']); + expect(security).toMatchObject({ + name: 'security', + label: 'security', + total: 2, + passed: 1, + failed: 1, + childCount: 1, + }); + expect(network).toMatchObject({ + name: 'security/network', + label: 'network', + parent: 'security', + depth: 1, + total: 1, + passed: 1, + }); + }); + + it('preserves existing flat categories as one-node paths', () => { + const tree = buildCategoryTree( + [result({ testId: 'flat', category: 'Safety > PII', score: 0.5 })], + 0.8, + ); + + expect(tree).toHaveLength(1); + expect(tree[0]).toMatchObject({ + name: 'Safety > PII', + label: 'Safety > PII', + total: 1, + failed: 1, + children: [], + }); + }); + + it('canonicalizes explicit slash category strings', () => { + expect(normalizeCategoryPath(' security / network ')).toBe('security/network'); + expect(normalizeCategoryPath('security\\network')).toBe('security/network'); + }); +}); diff --git a/apps/dashboard/src/lib/category-tree.ts b/apps/dashboard/src/lib/category-tree.ts new file mode 100644 index 000000000..22ede49c4 --- /dev/null +++ b/apps/dashboard/src/lib/category-tree.ts @@ -0,0 +1,134 @@ +import { summarizeQuality } from './result-summary'; +import type { EvalResult } from './types'; + +export const DEFAULT_CATEGORY = 'Uncategorized'; + +export interface CategoryTreeNode { + name: string; + label: string; + parent?: string; + depth: number; + total: number; + passed: number; + failed: number; + executionErrors: number; + avgScore: number; + suiteCount: number; + childCount: number; + children: CategoryTreeNode[]; +} + +interface CategoryBucket { + results: EvalResult[]; + suites: Set; + children: Set; +} + +export function normalizeCategoryPath(category: string | undefined): string { + const normalized = category + ?.replace(/\\/g, '/') + .split('/') + .map((part) => part.trim()) + .filter((part) => part.length > 0) + .join('/'); + return normalized && normalized.length > 0 ? normalized : DEFAULT_CATEGORY; +} + +export function buildCategoryTree( + results: readonly EvalResult[], + passThreshold: number, +): CategoryTreeNode[] { + const buckets = new Map(); + const ensureBucket = (name: string): CategoryBucket => { + const existing = buckets.get(name); + if (existing) return existing; + const created = { results: [], suites: new Set(), children: new Set() }; + buckets.set(name, created); + return created; + }; + + for (const result of results) { + const category = normalizeCategoryPath(result.category); + const suite = result.suite ?? 'Uncategorized'; + const prefixes = categoryPrefixes(category); + for (const prefix of prefixes) { + const bucket = ensureBucket(prefix); + bucket.results.push(result); + bucket.suites.add(suite); + } + for (let index = 1; index < prefixes.length; index++) { + ensureBucket(prefixes[index - 1]).children.add(prefixes[index]); + } + } + + const nodeByName = new Map( + [...buckets.entries()].map(([name, bucket]) => [ + name, + summarizeCategoryBucket(name, bucket, passThreshold), + ]), + ); + + return [...nodeByName.values()] + .filter((node) => !node.parent) + .sort(compareCategoryNodes) + .map((node) => attachChildren(node, buckets, nodeByName)); +} + +export function flattenCategoryTree(nodes: readonly CategoryTreeNode[]): CategoryTreeNode[] { + return nodes.flatMap((node) => [node, ...flattenCategoryTree(node.children)]); +} + +function categoryPrefixes(category: string): string[] { + const parts = category.split('/').filter((part) => part.length > 0); + if (parts.length === 0) return [DEFAULT_CATEGORY]; + return parts.map((_, index) => parts.slice(0, index + 1).join('/')); +} + +function categoryParent(category: string): string | undefined { + const parts = category.split('/'); + return parts.length > 1 ? parts.slice(0, -1).join('/') : undefined; +} + +function categoryLabel(category: string): string { + return category.split('/').at(-1) ?? category; +} + +function summarizeCategoryBucket( + name: string, + bucket: CategoryBucket, + passThreshold: number, +): CategoryTreeNode { + const summary = summarizeQuality(bucket.results, passThreshold); + const parent = categoryParent(name); + return { + name, + label: categoryLabel(name), + ...(parent && { parent }), + depth: name.split('/').filter(Boolean).length - 1, + total: summary.total, + passed: summary.passed, + failed: summary.failed, + executionErrors: summary.executionErrors, + avgScore: summary.avgScore, + suiteCount: bucket.suites.size, + childCount: bucket.children.size, + children: [], + }; +} + +function attachChildren( + node: CategoryTreeNode, + buckets: Map, + nodeByName: Map, +): CategoryTreeNode { + const children = [...(buckets.get(node.name)?.children ?? [])] + .map((childName) => nodeByName.get(childName)) + .filter((child): child is CategoryTreeNode => Boolean(child)) + .sort(compareCategoryNodes) + .map((child) => attachChildren(child, buckets, nodeByName)); + return { ...node, children }; +} + +function compareCategoryNodes(first: CategoryTreeNode, second: CategoryTreeNode): number { + return first.name.localeCompare(second.name); +} diff --git a/apps/dashboard/src/lib/score-distribution.test.ts b/apps/dashboard/src/lib/score-distribution.test.ts index 6e45f91c6..bba65b3be 100644 --- a/apps/dashboard/src/lib/score-distribution.test.ts +++ b/apps/dashboard/src/lib/score-distribution.test.ts @@ -91,6 +91,67 @@ describe('buildScoreDistributionModel', () => { ]); }); + it('treats parent category filters as descendant rollups from category metadata', () => { + const data = compareFixture(); + if (data.runs) { + data.runs[0].tests = [ + { + test_id: 'network', + category: 'security/network', + score: 0.45, + passed: false, + }, + { + test_id: 'application', + category: 'security/application', + score: 0.85, + passed: true, + }, + ]; + } + + const model = buildScoreDistributionModel(data, filters({ category: 'security' }), NOW); + + expect(model.categoryOptions).toEqual( + expect.arrayContaining([ + { value: 'security', label: 'security', count: 2 }, + { value: 'security/application', label: 'security/application', count: 1 }, + { value: 'security/network', label: 'security/network', count: 1 }, + ]), + ); + expect(model.filteredScores).toBe(2); + }); + + it('does not derive category metadata from eval paths', () => { + const data = { + experiments: ['exp-a'], + targets: ['gpt-4o'], + cells: [ + { + experiment: 'exp-a', + target: 'gpt-4o', + eval_count: 1, + passed_count: 1, + pass_rate: 1, + avg_score: 1, + tests: [ + { + test_id: 'path-only', + eval_path: 'security/network.eval.yaml', + score: 1, + passed: true, + }, + ], + }, + ], + } as unknown as CompareResponse; + + const model = buildScoreDistributionModel(data, filters({ category: 'security' }), NOW); + + expect(model.categoryAvailable).toBe(false); + expect(model.filteredScores).toBe(0); + }); + it('returns empty buckets when no scores match the selected slice', () => { const model = buildScoreDistributionModel( compareFixture(), diff --git a/apps/dashboard/src/lib/score-distribution.ts b/apps/dashboard/src/lib/score-distribution.ts index efd43db3d..dfa2fe1bc 100644 --- a/apps/dashboard/src/lib/score-distribution.ts +++ b/apps/dashboard/src/lib/score-distribution.ts @@ -8,6 +8,7 @@ * metadata field is needed, then filter samples in `buildScoreDistributionModel`. */ +import { normalizeCategoryPath } from './category-tree'; import type { CompareResponse, CompareRunEntry, CompareTestResult } from './types'; export const ALL_DISTRIBUTION_FILTER_VALUE = ''; @@ -68,7 +69,9 @@ export function buildScoreDistributionModel( ): ScoreDistributionModel { const samples = collectScoreSamples(data); const experimentOptions = buildExperimentOptions(data, samples); - const categoryOptions = buildOptions(samples.flatMap((sample) => sample.category ?? [])); + const categoryOptions = buildOptions( + samples.flatMap((sample) => (sample.category ? categoryPrefixes(sample.category) : [])), + ); const categoryAvailable = categoryOptions.length > 0; const hasTimestampedScores = samples.some((sample) => sample.startedAtMs !== undefined); const activePeriod = @@ -79,7 +82,7 @@ export function buildScoreDistributionModel( const filtered = samples.filter((sample) => { if (filters.experiment && sample.experiment !== filters.experiment) return false; - if (filters.category && sample.category !== filters.category) return false; + if (filters.category && !isCategoryDescendant(sample.category, filters.category)) return false; if (windowStartMs !== undefined) { return sample.startedAtMs !== undefined && sample.startedAtMs >= windowStartMs; } @@ -174,7 +177,19 @@ function buildBuckets(scores: number[]): ScoreDistributionBucket[] { function normalizeCategory(value: string | undefined): string | undefined { const trimmed = value?.trim(); - return trimmed ? trimmed : undefined; + return trimmed ? normalizeCategoryPath(trimmed) : undefined; +} + +function categoryPrefixes(category: string): string[] { + const parts = category.split('/').filter((part) => part.length > 0); + return parts.map((_, index) => parts.slice(0, index + 1).join('/')); +} + +function isCategoryDescendant(category: string | undefined, selectedCategory: string): boolean { + return ( + category !== undefined && + (category === selectedCategory || category.startsWith(`${selectedCategory}/`)) + ); } function parseTimestamp(value: string): number | undefined { diff --git a/apps/dashboard/src/lib/types.ts b/apps/dashboard/src/lib/types.ts index f928a3ea9..ea5084dea 100644 --- a/apps/dashboard/src/lib/types.ts +++ b/apps/dashboard/src/lib/types.ts @@ -502,16 +502,22 @@ export interface FileContentResponse { export interface CategorySummary { name: string; + label?: string; + parent?: string; + depth?: number; total: number; passed: number; failed: number; avg_score: number; execution_error_count?: number; suite_count: number; + child_count?: number; + children?: CategorySummary[]; } export interface CategoriesResponse { categories: CategorySummary[]; + category_tree?: CategorySummary[]; } export interface StudioConfigResponse { diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx index 5b91ebd1e..1ffdbb25e 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx @@ -109,6 +109,7 @@ tests: |-------|-------------| | `description` | Human-readable description of the evaluation | | `suite` | Optional suite identifier | +| `category` | Optional slash-delimited analytics taxonomy path. Overrides the category derived from the eval file path. | | `experiment` | Runtime policy (`target`, `targets`, `workers`, `repeat`, `threshold`, `timeout_seconds`, `budget_usd`, etc.) | | `workspace` | Suite-level task environment — inline object or string path to an [external workspace file](/docs/guides/workspace-pool/#external-workspace-config). Repo entries declare identity and checkout pins; acquisition is covered in [Workspace Architecture](/docs/guides/workspace-architecture/#repo-provenance-vs-acquisition). | | `tests` | Array of individual tests, include entries, or a string path to an external file or directory. Tests and include entries may use scoped `run:` overrides for `threshold`, `repeat`, `timeout_seconds`, and `budget_usd`. | @@ -154,6 +155,13 @@ tests: input: Screen "Acme Corp" against denied parties list ``` +When `category` is omitted, AgentV derives it from the eval file path. Generic +filenames do not add a leaf: `security/eval.yaml` becomes `security`, and +`security/network/dataset.eval.yaml` becomes `security/network`. A meaningful +named eval file contributes a leaf, so `security/network.eval.yaml` becomes +`security/network`. Existing flat category strings remain valid one-node +category paths. + ### Suite-level Assertions The `assertions` field is the canonical way to define suite-level graders. Suite-level assertions are appended to every test's graders unless a test sets `execution.skip_defaults: true`. diff --git a/packages/core/src/evaluation/category.ts b/packages/core/src/evaluation/category.ts index 7f4a39e5a..e09bfdde0 100644 --- a/packages/core/src/evaluation/category.ts +++ b/packages/core/src/evaluation/category.ts @@ -1,18 +1,52 @@ -/** Default category for eval files without subdirectory structure. */ +/** Default category for eval files without category taxonomy metadata. */ export const DEFAULT_CATEGORY = 'Uncategorized'; +const GENERIC_EVAL_FILE_STEMS = new Set(['eval', 'dataset']); + /** - * Derive a human-readable category from an eval file's relative path. + * Canonicalize analytics category taxonomy paths. * - * Strips the filename and any `evals` directory segments, then joins - * remaining directories with `/`. Returns {@link DEFAULT_CATEGORY} for files - * at the root level. + * Categories are slash-delimited analytics paths, not filesystem paths. Existing + * flat labels remain valid one-node paths, while repeated slash separators and + * surrounding whitespace are normalized for derived and explicit categories. + */ +export function normalizeCategoryPath(category: string | undefined): string { + const normalized = category + ?.replace(/\\/g, '/') + .split('/') + .map((part) => part.trim()) + .filter((part) => part.length > 0) + .join('/'); + return normalized && normalized.length > 0 ? normalized : DEFAULT_CATEGORY; +} + +function evalFileStem(fileName: string): string { + return fileName.replace(/\.eval\.[^.]+$/i, '').replace(/\.[^.]+$/i, ''); +} + +/** + * Derive a canonical slash-delimited analytics category path from an eval file. + * + * Generic eval filenames such as `eval.yaml` and `dataset.eval.yaml` do not add + * a taxonomy leaf. Meaningful named eval files such as `network.eval.yaml` do + * contribute a leaf. Any `evals` directory segment is treated as organization + * only and is removed from the analytics taxonomy. */ export function deriveCategory(relativePath: string): string { - const parts = relativePath.split(/[/\\]/); - if (parts.length <= 1) { + const parts = relativePath + .split(/[/\\]/) + .map((part) => part.trim()) + .filter((part) => part.length > 0); + const fileName = parts.at(-1); + if (!fileName) { return DEFAULT_CATEGORY; } - const dirs = parts.slice(0, -1).filter((d) => d !== 'evals'); - return dirs.length > 0 ? dirs.join('/') : DEFAULT_CATEGORY; + + const taxonomyParts = parts.slice(0, -1).filter((part) => part !== 'evals'); + const stem = evalFileStem(fileName).trim(); + if (stem && !GENERIC_EVAL_FILE_STEMS.has(stem.toLowerCase())) { + taxonomyParts.push(stem); + } + + return normalizeCategoryPath(taxonomyParts.join('/')); } diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index df047d03e..4d3faf794 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -4,6 +4,7 @@ import fg from 'fast-glob'; import micromatch from 'micromatch'; import { stringify as stringifyYaml } from 'yaml'; +import { normalizeCategoryPath } from './category.js'; import { type ExperimentConfig, normalizeExperimentConfig, @@ -789,10 +790,12 @@ async function loadTestsFromParsedYamlValue( ? (renderedCase.window_size as number) : undefined; + const category = normalizeCategoryPath(suite.category ?? options?.category); + const testCase: EvalTest = { id, suite: suiteName, - category: suite.category ?? options?.category, + category, conversation_id: conversationId, question: question, input: inputMessages, diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 141ab1d50..072bbded2 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -188,7 +188,7 @@ export { } from './projects.js'; export { syncProject, syncProjects } from './project-sync.js'; export { trimBaselineResult } from './evaluation/baseline.js'; -export { DEFAULT_CATEGORY, deriveCategory } from './evaluation/category.js'; +export { DEFAULT_CATEGORY, deriveCategory, normalizeCategoryPath } from './evaluation/category.js'; export * from './observability/index.js'; // Registry exports diff --git a/packages/core/test/evaluation/category.test.ts b/packages/core/test/evaluation/category.test.ts index 9b8c62d01..3dc3f70f9 100644 --- a/packages/core/test/evaluation/category.test.ts +++ b/packages/core/test/evaluation/category.test.ts @@ -1,27 +1,35 @@ import { describe, expect, test } from 'bun:test'; -import { DEFAULT_CATEGORY, deriveCategory } from '../../src/evaluation/category.js'; +import { + DEFAULT_CATEGORY, + deriveCategory, + normalizeCategoryPath, +} from '../../src/evaluation/category.js'; describe('deriveCategory', () => { test('returns Uncategorized for single-segment path (root-level file)', () => { expect(deriveCategory('dataset.eval.yaml')).toBe(DEFAULT_CATEGORY); }); + test('uses a meaningful root-level eval filename as a one-node category path', () => { + expect(deriveCategory('network.eval.yaml')).toBe('network'); + }); + test('returns Uncategorized when only directory is evals', () => { expect(deriveCategory('evals/dataset.eval.yaml')).toBe(DEFAULT_CATEGORY); }); - test('strips evals segment and returns remaining directory', () => { - expect(deriveCategory('evals/fundamentals/greetings.eval.yaml')).toBe('fundamentals'); + test('strips evals segment and appends meaningful named eval files as a leaf', () => { + expect(deriveCategory('evals/fundamentals/greetings.eval.yaml')).toBe('fundamentals/greetings'); }); - test('preserves nested directory paths', () => { + test('does not append generic eval filenames to nested directory paths', () => { expect(deriveCategory('evals/cargowise-customs/layout-engine/eval.yaml')).toBe( 'cargowise-customs/layout-engine', ); }); - test('handles paths without evals segment', () => { + test('handles generic filenames without evals segment', () => { expect(deriveCategory('examples/showcase/eval.yaml')).toBe('examples/showcase'); }); @@ -38,4 +46,27 @@ describe('deriveCategory', () => { test('returns Uncategorized for just a filename with no directory', () => { expect(deriveCategory('eval.yaml')).toBe(DEFAULT_CATEGORY); }); + + test('matches the hierarchical category derivation contract', () => { + expect(deriveCategory('security/eval.yaml')).toBe('security'); + expect(deriveCategory('security/network.eval.yaml')).toBe('security/network'); + expect(deriveCategory('security/network/dataset.eval.yaml')).toBe('security/network'); + }); +}); + +describe('normalizeCategoryPath', () => { + test('canonicalizes explicit slash-delimited taxonomy paths', () => { + expect(normalizeCategoryPath(' security / network ')).toBe('security/network'); + expect(normalizeCategoryPath('security//network')).toBe('security/network'); + expect(normalizeCategoryPath('security\\network')).toBe('security/network'); + }); + + test('preserves existing flat category strings as one-node paths', () => { + expect(normalizeCategoryPath('Safety > PII')).toBe('Safety > PII'); + }); + + test('returns Uncategorized for empty explicit categories', () => { + expect(normalizeCategoryPath(' / ')).toBe(DEFAULT_CATEGORY); + expect(normalizeCategoryPath(undefined)).toBe(DEFAULT_CATEGORY); + }); }); diff --git a/packages/core/test/evaluation/yaml-parser-metadata.test.ts b/packages/core/test/evaluation/yaml-parser-metadata.test.ts index e9653f61d..772aeed95 100644 --- a/packages/core/test/evaluation/yaml-parser-metadata.test.ts +++ b/packages/core/test/evaluation/yaml-parser-metadata.test.ts @@ -74,6 +74,19 @@ tests: expect(suite.metadata).toBeUndefined(); }); + it('uses explicit YAML category as a canonical taxonomy path override', async () => { + const { filePath, dir } = createTempYaml(` +category: " security / network " +tests: + - id: test-1 + input: "Hello" + criteria: "Greet" +`); + + const suite = await loadTestSuite(filePath, dir, { category: 'derived/path' }); + expect(suite.tests[0].category).toBe('security/network'); + }); + it('still loads tests correctly when metadata is present', async () => { const { filePath, dir } = createTempYaml(` name: my-eval