Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 122 additions & 27 deletions apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ import {
getProject,
loadConfig,
loadProjectRegistry,
normalizeCategoryPath,
normalizeTraceArtifactToTraceSessionResponse,
omitExternalTraceMetadataKeys,
readGitResultArtifact,
Expand Down Expand Up @@ -1883,30 +1884,7 @@ async function handleRunCategories(c: C, { searchDir, agentvDir, projectId }: Da
try {
const loaded = await loadManifestResultsForMeta(searchDir, meta, projectId);
const { threshold: pass_threshold } = loadStudioConfig(agentvDir);
const categoryMap = new Map<string, { results: EvaluationResult[]; suites: Set<string> }>();
for (const r of loaded) {
const cat = r.category ?? DEFAULT_CATEGORY;
const entry = categoryMap.get(cat) ?? {
results: [],
suites: new Set<string>(),
};
entry.results.push(r);
entry.suites.add(r.suite ?? r.target ?? 'default');
categoryMap.set(cat, entry);
}
const categories = [...categoryMap.entries()].map(([name, entry]) => {
const qualitySummary = summarizeQualityResults(entry.results, pass_threshold);
return {
name,
total: qualitySummary.totalCount,
passed: qualitySummary.passedCount,
failed: qualitySummary.qualityFailureCount,
avg_score: qualitySummary.avgScore,
execution_error_count: qualitySummary.executionErrorCount,
suite_count: entry.suites.size,
};
});
return c.json({ categories });
return c.json(buildCategoryRollups(loaded, pass_threshold));
} catch {
return c.json({ error: 'Failed to load categories' }, 500);
}
Expand All @@ -1920,7 +1898,10 @@ async function handleCategorySuites(c: C, { searchDir, agentvDir, projectId }: D
try {
const loaded = await loadManifestResultsForMeta(searchDir, meta, projectId);
const { threshold: pass_threshold } = loadStudioConfig(agentvDir);
const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category);
const selectedCategory = normalizeCategoryPath(category);
const filtered = loaded.filter((r) =>
isCategoryDescendant(categoryPathFromResult(r), selectedCategory),
);
const suiteMap = new Map<string, EvaluationResult[]>();
for (const r of filtered) {
const ds = r.suite ?? r.target ?? 'default';
Expand All @@ -1945,6 +1926,120 @@ async function handleCategorySuites(c: C, { searchDir, agentvDir, projectId }: D
}
}

/**
 * Accumulator for a single category path while rollups are being built.
 * `buildCategoryRollups` keeps one bucket per category prefix and fills the
 * collections in place.
 */
interface CategoryRollupBucket {
/** Results filed under this path, including results of categories nested beneath it. */
readonly results: EvaluationResult[];
/** Distinct suite names collected from the results added to this bucket. */
readonly suites: Set<string>;
/** Full paths of the categories one level beneath this one. */
readonly children: Set<string>;
}

/**
 * Serialized rollup for one category path, as produced by
 * `summarizeCategoryBucket`. The same shape is used for the flat `categories`
 * list and for the nodes of `category_tree`.
 */
interface CategoryRollupSummary {
/** Full `/`-separated category path. */
readonly name: string;
/** Last `/`-separated segment of `name`. */
readonly label: string;
/** Path of the enclosing category; omitted when the category has no parent. */
readonly parent?: string;
/** Number of non-empty path segments in `name`, minus one. */
readonly depth: number;
/** `totalCount` from the quality summary of the bucket's results. */
readonly total: number;
/** `passedCount` from the quality summary. */
readonly passed: number;
/** `qualityFailureCount` from the quality summary. */
readonly failed: number;
/** `avgScore` from the quality summary. */
readonly avg_score: number;
/** `executionErrorCount` from the quality summary. */
readonly execution_error_count: number;
/** Number of distinct suites in the bucket. */
readonly suite_count: number;
/** Number of direct child categories in the bucket. */
readonly child_count: number;
/** Nested child nodes; only set on tree nodes that have at least one child. */
readonly children?: CategoryRollupSummary[];
}

/**
 * Resolves the normalized category path a result is filed under.
 *
 * A missing category falls back to `DEFAULT_CATEGORY`. The same fallback is
 * applied when the normalized path is empty: `??` only replaces `null` and
 * `undefined`, so an empty category string would otherwise slip through.
 * `categoryPrefixes` files empty paths under `DEFAULT_CATEGORY`, so returning
 * `''` here would make the descendant filter disagree with the rollups.
 *
 * @param result - Evaluation result whose `category` is read.
 * @returns The normalized category path, or `DEFAULT_CATEGORY` when that path is empty.
 */
function categoryPathFromResult(result: EvaluationResult): string {
  const normalized = normalizeCategoryPath(result.category ?? DEFAULT_CATEGORY);
  // NOTE(review): assumes normalizeCategoryPath can yield '' for blank input — confirm against its implementation.
  return normalized.length > 0 ? normalized : DEFAULT_CATEGORY;
}

/**
 * Lists every ancestor path of a category, shallowest first, ending with the
 * category itself (e.g. `a/b/c` yields `a`, `a/b`, `a/b/c`).
 *
 * Empty segments are ignored. A path with no usable segments yields a single
 * `DEFAULT_CATEGORY` entry.
 *
 * @param category - `/`-separated category path.
 * @returns The cumulative prefixes of the path.
 */
function categoryPrefixes(category: string): string[] {
  const prefixes: string[] = [];
  for (const segment of category.split('/')) {
    if (segment.length === 0) continue;
    // Each prefix extends the previous one by exactly one segment.
    const previous = prefixes[prefixes.length - 1];
    prefixes.push(previous === undefined ? segment : `${previous}/${segment}`);
  }
  return prefixes.length > 0 ? prefixes : [DEFAULT_CATEGORY];
}

/**
 * Returns the path of the category that directly contains `category`.
 *
 * @param category - `/`-separated category path.
 * @returns Everything before the last `/`, or `undefined` when the path
 *   contains no `/` at all.
 */
function categoryParent(category: string): string | undefined {
  const lastSeparator = category.lastIndexOf('/');
  if (lastSeparator === -1) return undefined;
  return category.slice(0, lastSeparator);
}

/**
 * Returns the display label of a category: the text after its last `/`.
 *
 * @param category - `/`-separated category path.
 * @returns The final path segment, or the whole string when it has no `/`.
 */
function categoryLabel(category: string): string {
  // lastIndexOf yields -1 when there is no separator, so the slice starts at 0.
  const labelStart = category.lastIndexOf('/') + 1;
  return category.slice(labelStart);
}

/**
 * Tells whether `category` is the selected category itself or lies beneath it.
 *
 * The match is on whole path segments: `security-extra` is not treated as a
 * descendant of `security`.
 *
 * @param category - Category path being tested.
 * @param selectedCategory - Category path chosen for drill-down.
 * @returns `true` for an exact match or when `category` starts with
 *   `selectedCategory` followed by `/`.
 */
function isCategoryDescendant(category: string, selectedCategory: string): boolean {
  if (category === selectedCategory) return true;
  const descendantPrefix = selectedCategory + '/';
  return category.startsWith(descendantPrefix);
}

/**
 * Converts an accumulated category bucket into its serializable summary.
 *
 * @param name - Full category path the bucket belongs to.
 * @param entry - Bucket holding the results, suites and children for `name`.
 * @param passThreshold - Threshold forwarded to `summarizeQualityResults`.
 * @returns The rollup summary; `parent` is only present when the path has one.
 */
function summarizeCategoryBucket(
  name: string,
  entry: CategoryRollupBucket,
  passThreshold: number,
): CategoryRollupSummary {
  const { totalCount, passedCount, qualityFailureCount, avgScore, executionErrorCount } =
    summarizeQualityResults(entry.results, passThreshold);
  const parent = categoryParent(name);
  const segmentCount = name.split('/').filter((segment) => segment.length > 0).length;

  return {
    name,
    label: categoryLabel(name),
    // Keep `parent` out of the payload entirely for top-level categories.
    ...(parent ? { parent } : {}),
    depth: segmentCount - 1,
    total: totalCount,
    passed: passedCount,
    failed: qualityFailureCount,
    avg_score: avgScore,
    execution_error_count: executionErrorCount,
    suite_count: entry.suites.size,
    child_count: entry.children.size,
  };
}

/**
 * Builds hierarchical category rollups for a set of evaluation results.
 *
 * Every result is counted in its own category and in each ancestor category,
 * so a parent's totals include the results of everything nested beneath it.
 *
 * @param results - Evaluation results to group.
 * @param passThreshold - Threshold forwarded to the per-category summary.
 * @returns `categories`: a flat list of every category (ancestors included),
 *   sorted by name; `category_tree`: the categories without a parent, each
 *   carrying its descendants under `children`, siblings sorted by name.
 */
function buildCategoryRollups(
  results: readonly EvaluationResult[],
  passThreshold: number,
): { categories: CategoryRollupSummary[]; category_tree: CategoryRollupSummary[] } {
  // Single comparator shared by the flat list and by every sibling group.
  const byName = (a: CategoryRollupSummary, b: CategoryRollupSummary): number =>
    a.name.localeCompare(b.name);

  const buckets = new Map<string, CategoryRollupBucket>();
  const bucketFor = (name: string): CategoryRollupBucket => {
    let bucket = buckets.get(name);
    if (!bucket) {
      // The declared type keeps `results: []` from being inferred as `never[]`.
      bucket = { results: [], suites: new Set<string>(), children: new Set<string>() };
      buckets.set(name, bucket);
    }
    return bucket;
  };

  for (const result of results) {
    const suite = result.suite ?? result.target ?? 'default';
    let parentBucket: CategoryRollupBucket | undefined;
    for (const prefix of categoryPrefixes(categoryPathFromResult(result))) {
      const bucket = bucketFor(prefix);
      bucket.results.push(result);
      bucket.suites.add(suite);
      // Link each prefix to the one directly above it.
      parentBucket?.children.add(prefix);
      parentBucket = bucket;
    }
  }

  const categories = [...buckets.entries()]
    .map(([name, bucket]) => summarizeCategoryBucket(name, bucket, passThreshold))
    .sort(byName);

  const summariesByName = new Map<string, CategoryRollupSummary>();
  for (const summary of categories) {
    summariesByName.set(summary.name, summary);
  }

  const buildTreeNode = (summary: CategoryRollupSummary): CategoryRollupSummary => {
    const childNames = buckets.get(summary.name)?.children ?? [];
    const children = [...childNames]
      .map((childName) => summariesByName.get(childName))
      .filter((child): child is CategoryRollupSummary => child !== undefined)
      .sort(byName)
      .map((child) => buildTreeNode(child));
    return children.length > 0 ? { ...summary, children } : summary;
  };

  // `categories` is already sorted and `filter` preserves order, so the roots
  // need no second sort.
  const categoryTree = categories
    .filter((summary) => !summary.parent)
    .map((summary) => buildTreeNode(summary));

  return { categories, category_tree: categoryTree };
}

async function handleEvalDetail(c: C, { searchDir, projectId }: DataContext) {
const filename = c.req.param('filename') ?? '';
const evalId = c.req.param('evalId') ?? '';
Expand Down Expand Up @@ -2449,7 +2544,7 @@ async function handleCompare(c: C, { searchDir, agentvDir, projectId }: DataCont
}
entry.tests.push({
test_id: r.testId,
...(r.category && { category: r.category }),
...(r.category && { category: normalizeCategoryPath(r.category) }),
score: r.score,
passed,
execution_status: r.executionStatus,
Expand All @@ -2459,7 +2554,7 @@ async function handleCompare(c: C, { searchDir, agentvDir, projectId }: DataCont
// Per-run accumulation. Dedupe tests within the run by last-wins.
runTestMap.set(r.testId, {
test_id: r.testId,
...(r.category && { category: r.category }),
...(r.category && { category: normalizeCategoryPath(r.category) }),
score: r.score,
passed,
execution_status: r.executionStatus,
Expand Down
118 changes: 107 additions & 11 deletions apps/cli/test/commands/results/serve.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1457,17 +1457,113 @@ describe('serve app', () => {
suite_count: number;
}>;
};
expect(categoriesData.categories).toEqual([
{
name: 'runtime',
total: 3,
passed: 1,
failed: 1,
avg_score: 0.75,
execution_error_count: 1,
suite_count: 1,
},
]);
expect(categoriesData.categories).toHaveLength(1);
expect(categoriesData.categories[0]).toMatchObject({
name: 'runtime',
total: 3,
passed: 1,
failed: 1,
avg_score: 0.75,
execution_error_count: 1,
suite_count: 1,
});
});

// Exercises the categories endpoint and the per-category suites endpoint with
// a run that mixes a nested category, its parent, and a flat category.
it('returns hierarchical category rollups and descendant category drilldown', async () => {
const runsDir = localResultsExperimentDir(tempDir);
mkdirSync(runsDir, { recursive: true });
const filename = '2026-03-25T10-30-00-000Z';
const runDir = path.join(runsDir, filename);
mkdirSync(runDir, { recursive: true });
// Fixture: one result in `security/network`, one directly in `security`,
// and one in the slash-free category `legacy-flat`.
writeFileSync(
path.join(runDir, 'index.jsonl'),
toJsonl(
{
...RESULT_A,
test_id: 'network-pass',
suite: 'network-suite',
category: 'security/network',
score: 1,
},
{
...RESULT_B,
test_id: 'security-fail',
suite: 'root-suite',
category: 'security',
score: 0,
},
{
...RESULT_A,
test_id: 'flat-pass',
suite: 'legacy-suite',
category: 'legacy-flat',
score: 1,
},
),
);

const app = createApp([], tempDir, tempDir, undefined, { studioDir });

const categoriesRes = await app.request(`/api/runs/${filename}/categories`);
expect(categoriesRes.status).toBe(200);
const categoriesData = (await categoriesRes.json()) as {
categories: Array<{
name: string;
parent?: string;
total: number;
passed: number;
failed: number;
child_count?: number;
}>;
category_tree?: Array<{ name: string; children?: Array<{ name: string }> }>;
};

// The `security` rollup is expected to count both its own result and the
// one filed under `security/network`.
expect(categoriesData.categories).toEqual(
expect.arrayContaining([
expect.objectContaining({
name: 'security',
total: 2,
passed: 1,
failed: 1,
child_count: 1,
}),
expect.objectContaining({
name: 'security/network',
parent: 'security',
total: 1,
passed: 1,
failed: 0,
}),
expect.objectContaining({
name: 'legacy-flat',
total: 1,
passed: 1,
failed: 0,
}),
]),
);
// The tree view nests `security/network` under the `security` node.
expect(categoriesData.category_tree).toEqual(
expect.arrayContaining([
expect.objectContaining({
name: 'security',
children: [expect.objectContaining({ name: 'security/network' })],
}),
]),
);

// Drilling into `security` should list suites from the category itself and
// from its descendant `security/network`.
const suitesRes = await app.request(
`/api/runs/${filename}/categories/${encodeURIComponent('security')}/suites`,
);
expect(suitesRes.status).toBe(200);
const suitesData = (await suitesRes.json()) as {
suites: Array<{ name: string; total: number }>;
};
expect(suitesData.suites).toEqual(
expect.arrayContaining([
expect.objectContaining({ name: 'network-suite', total: 1 }),
expect.objectContaining({ name: 'root-suite', total: 1 }),
]),
);
});

it('infers the experiment name from the run id when live results have not written it yet', async () => {
Expand Down
Loading