Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion packages/gooddata-eval/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,44 @@ A dataset is a folder of `.json` files, one per question:
```

Supported `test_kind` values: `visualization`, `metric_skill`, `alert_skill`,
`search_tool`, `general_question`, `guardrail`.
`search_tool`, `general_question`, `guardrail`, `dashboard_summary`.

### `dashboard_summary` items

Summary items call the dedicated summary endpoint
(`POST /api/v1/ai/workspaces/{ws}/summary`) instead of the chat endpoint, so
they carry an extra `summary_input` block, and the `expected_output` is a
**rubric** rather than an exact answer (summaries are free text):

```json
{
"id": "summary-001",
"dataset_name": "summary_pilot",
"test_kind": "dashboard_summary",
"question": "Summarize the Sales Overview dashboard.",
"summary_input": {
"dashboard_id": "sales_overview"
},
"expected_output": {
"must_include": ["States the overall revenue trend", "Identifies the top segment"],
"must_not_include": ["Numbers or segments not present in the visualizations"],
"rubric": ["Reads as a coherent business summary"]
}
}
```

`summary_input` requires only `dashboard_id` (the endpoint summarizes the whole
dashboard). Optional fields narrow the scope: `visualizations` (list of ids),
`filter_context` (AFM filters), `tab_id`, and `format_hint`.

The `expected_output` rubric:

- `must_include` — facts a good summary must contain; **all** must pass for the item to pass.
- `must_not_include` — hallucination/accuracy guards; **any** violation fails the item.
- `rubric` — soft quality dimensions; they affect `quality_score` but do not gate pass/fail.

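Each criterion is scored independently by the LLM judge, so `quality_score`
is the fraction of satisfied criteria across all three lists (a
`must_not_include` criterion counts as satisfied when the summary avoids it).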

## Supported test kinds

Expand All @@ -175,6 +212,7 @@ Supported `test_kind` values: `visualization`, `metric_skill`, `alert_skill`,
| `search_tool` | `search_objects` tool call (correct function called = pass; correct arguments = quality score) | — |
| `general_question` | Text answer judged by LLM | `[llm-judge]` |
| `guardrail` | Refusal/redirect (visualization response auto-fails) | `[llm-judge]` |
| `dashboard_summary` | Dashboard summary (via `/summary` endpoint) scored against a rubric by LLM | `[llm-judge]` |

## Optional extras

Expand Down
31 changes: 29 additions & 2 deletions packages/gooddata-eval/src/gooddata_eval/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,38 @@
from gooddata_eval.core.connection import ConnectionError_, resolve_connection
from gooddata_eval.core.dataset.local import load_local_dataset
from gooddata_eval.core.langfuse.sink import LangfuseSink
from gooddata_eval.core.models import DatasetItem
from gooddata_eval.core.models import ChatResult, DatasetItem
from gooddata_eval.core.reporting.console import render_comparison, render_console
from gooddata_eval.core.reporting.json_report import write_multi_model_report
from gooddata_eval.core.runner import ItemReport, run_items
from gooddata_eval.core.summary.http_client import SummaryClient
from gooddata_eval.core.workspace import ModelResolutionError, WorkspaceModelController

_EXIT_OK = 0
_EXIT_OPERATIONAL_ERROR = 2
_SUMMARY_TEST_KIND = "dashboard_summary"


class _RoutingBackend:
    """Dispatch each item to the right backend by test_kind.

    `dashboard_summary` items go to the dedicated summary endpoint; everything
    else uses the conversational chat endpoint.

    Args:
        chat: Backend used for every test kind except `dashboard_summary`.
        summary: Backend used for `dashboard_summary` items.
    """

    def __init__(self, chat: ChatClient, summary: SummaryClient):
        self._chat = chat
        self._summary = summary

    def ask(self, item: DatasetItem) -> ChatResult:
        """Forward `item` to the backend selected by `item.test_kind`."""
        backend = self._summary if item.test_kind == _SUMMARY_TEST_KIND else self._chat
        return backend.ask(item)

    def close(self) -> None:
        """Close both wrapped backends.

        The summary backend is closed even when closing the chat backend
        raises, so one failing client cannot leak the other one's resources.
        Any exception raised while closing still propagates to the caller.
        """
        try:
            self._close_backend(self._chat)
        finally:
            self._close_backend(self._summary)

    @staticmethod
    def _close_backend(backend: object) -> None:
        """Call `backend.close()` when the backend exposes one; otherwise do nothing."""
        close = getattr(backend, "close", None)
        if close is not None:
            close()


def _build_parser() -> argparse.ArgumentParser:
Expand Down Expand Up @@ -263,7 +287,10 @@ def on_langfuse_item_done(
) -> None:
_sink.log_item(report, dataset_item_id=report.id)

backend = ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id)
backend = _RoutingBackend(
ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
SummaryClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
)
try:
report = run_items(
items,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

import httpx

from gooddata_eval.core.models import ChatResult
from gooddata_eval.core.models import ChatResult, DatasetItem

SSE_DATA_PREFIX = "data: "

Expand Down Expand Up @@ -169,11 +169,11 @@ def _send_message(self, conversation_id: str, question: str) -> ChatResult:
resp.raise_for_status()
return parse_sse_lines(resp.iter_lines())

def ask(self, question: str) -> ChatResult:
def ask(self, item: DatasetItem) -> ChatResult:
"""Run one single-turn conversation: create, send, parse, clean up."""
conversation_id = self._create_conversation()
try:
return self._send_message(conversation_id, question)
return self._send_message(conversation_id, item.question)
finally:
self._delete_conversation(conversation_id)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import httpx

from gooddata_eval.core.models import DatasetItem
from gooddata_eval.core.models import DatasetItem, SummaryInput

_DEFAULT_HOST = "https://cloud.langfuse.com"
_PAGE_SIZE = 100
Expand Down Expand Up @@ -47,6 +47,24 @@ def _question_from_input(raw_input: Any) -> str:
raise ValueError(f"Unsupported Langfuse item input shape: {raw_input!r}")


def _summary_input_from_raw(raw: dict, expected_output: Any) -> SummaryInput | None:
    """Find and validate the `summary_input` block of a dashboard_summary item.

    Langfuse items have no dedicated field for it, so three containers are
    searched in priority order and the first one holding a dict under the
    `summary_input` key wins:

    1. the item `input` object,
    2. the item `metadata`,
    3. the `expected_output`.

    Returns:
        The validated `SummaryInput`, or None when no container carries one.
    """
    for container in (raw.get("input"), raw.get("metadata"), expected_output):
        if not isinstance(container, dict):
            continue
        found = container.get("summary_input")
        if isinstance(found, dict):
            return SummaryInput.model_validate(found)
    return None


def _item_from_raw(raw: dict, *, dataset_name: str, test_kind: str) -> DatasetItem:
"""Map a Langfuse REST API dataset-item dict to a DatasetItem."""
# REST API returns camelCase: expectedOutput, not expected_output
Expand All @@ -60,6 +78,7 @@ def _item_from_raw(raw: dict, *, dataset_name: str, test_kind: str) -> DatasetIt
test_kind=resolved_kind,
question=_question_from_input(raw.get("input")),
expected_output=expected_output,
summary_input=_summary_input_from_raw(raw, expected_output),
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,18 @@
)
}

# LLM-judge evaluators (general_question, guardrail) require the [llm-judge] extra.
# Their modules are imported lazily on first use so the CLI starts without openai.
# LLM-judge evaluators (general_question, guardrail, dashboard_summary) require the
# [llm-judge] extra. Their modules are imported lazily on first use so the CLI
# starts without openai.
_LAZY_EVALUATOR_MODULES: dict[str, str] = {
"general_question": "gooddata_eval.core.evaluators.general_question",
"guardrail": "gooddata_eval.core.evaluators.guardrail",
"dashboard_summary": "gooddata_eval.core.evaluators.summary",
}
_LAZY_EVALUATOR_CLASSES: dict[str, str] = {
"general_question": "GeneralQuestionEvaluator",
"guardrail": "GuardrailEvaluator",
"dashboard_summary": "DashboardSummaryEvaluator",
}


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# (C) 2026 GoodData Corporation
"""Evaluator for dashboard_summary: rubric-based LLM-as-judge scoring.

Summaries are free text, so we do not match strings. Instead, `expected_output`
is a rubric of checkable criteria:

{
"must_include": ["...facts a good summary must contain..."],
"must_not_include": ["...things a good summary must avoid (hallucinations)..."],
"rubric": ["...soft quality dimensions..."]
}

Each criterion is scored independently by the judge (True/False), so the
runner's `quality_score` becomes the fraction of satisfied criteria. The item
*passes* only when every `must_include` is satisfied and no `must_not_include`
is violated; `rubric` items contribute to quality but do not gate pass/fail.

As a fallback, an `expected_output` that is not a dict — or a dict in which all
three lists are missing or empty — is treated as a single gating `must_include`
criterion (same pass/fail semantics as `general_question`).
"""

from typing import Any

from gooddata_eval.core.evaluators._llm_judge import LLMJudge
from gooddata_eval.core.evaluators._text_utils import extract_text
from gooddata_eval.core.evaluators.base import ItemEvaluation
from gooddata_eval.core.models import ChatResult, DatasetItem

# Judge instructions for criteria the summary is expected to SATISFY. The
# evaluator below passes these steps to the judge it uses for both
# `must_include` and `rubric` criteria; a score of 1 means "criterion met".
_POSITIVE_STEPS = [
    "Read the INPUT (the user's request) and the EXPECTED OUTPUT (one criterion the summary must satisfy).",
    "Read the ACTUAL OUTPUT (the generated summary).",
    "Score 1 if the actual output clearly satisfies the criterion (allow paraphrasing and reasonable numeric tolerance).",
    "Score 0 if the criterion is missing, contradicted, or only partially addressed.",
]

# For must_not_include we ask the judge a plain presence question and invert the
# result in code. Scoring "does the summary AVOID X?" via a field labelled
# EXPECTED OUTPUT is unreliable: the model reads the forbidden behaviour as
# desired and flips the verdict. Detecting presence (no negation, no
# contradictory label) is far more robust.
# Here a score of 1 therefore means "the forbidden characteristic IS present".
_VIOLATION_STEPS = [
    "Read the CHARACTERISTIC described in EXPECTED OUTPUT.",
    "Read the ACTUAL OUTPUT (the generated summary).",
    "Score 1 if the actual output clearly exhibits the described characteristic.",
    "Score 0 if it does not exhibit it.",
]


class DashboardSummaryEvaluator:
    """Rubric-based LLM-as-judge evaluator for the `dashboard_summary` test kind.

    Every criterion found in `item.expected_output` is judged on its own. The
    item passes only when all `must_include` criteria are satisfied and no
    `must_not_include` criterion is violated; `rubric` criteria only influence
    the quality score.
    """

    test_kind = "dashboard_summary"

    def __init__(self):
        # Two judges with different instructions: one checks that a criterion
        # is satisfied, the other detects the presence of a forbidden trait.
        self._positive_judge = LLMJudge(evaluation_steps=_POSITIVE_STEPS)
        self._violation_judge = LLMJudge(evaluation_steps=_VIOLATION_STEPS)

    @staticmethod
    def _as_criteria(value: Any) -> list[str]:
        """Normalize one rubric entry into a list of criterion strings.

        A missing/`None` value yields no criteria. A bare string is ONE
        criterion (iterating it would produce one criterion per character),
        and a blank string yields none. Lists/tuples map element-wise through
        `str`; any other scalar becomes a single stringified criterion.
        """
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value.strip() else []
        if isinstance(value, (list, tuple)):
            return [str(criterion) for criterion in value]
        return [str(value)]

    @staticmethod
    def _criteria(expected_output: Any) -> tuple[list[str], list[str], list[str]]:
        """Split `expected_output` into (must_include, must_not_include, rubric).

        When `expected_output` is not a dict, or is a dict that yields no
        criteria at all, the whole value is stringified and returned as a
        single gating `must_include` criterion (same pass/fail semantics as
        general_question).
        """
        if isinstance(expected_output, dict):
            normalize = DashboardSummaryEvaluator._as_criteria
            must_include = normalize(expected_output.get("must_include"))
            must_not_include = normalize(expected_output.get("must_not_include"))
            rubric = normalize(expected_output.get("rubric"))
            if must_include or must_not_include or rubric:
                return must_include, must_not_include, rubric
        return [str(expected_output)], [], []

    def evaluate(self, item: DatasetItem, chat_result: ChatResult) -> ItemEvaluation:
        """Judge the generated summary against every criterion of the item.

        Args:
            item: Dataset item; its `question` and `expected_output` are used.
            chat_result: Backend response from which the summary text is extracted.

        Returns:
            An `ItemEvaluation` whose `detail` holds `actual_output` plus, per
            criterion, `<group>_<i>` (bool verdict) and `<group>_<i>_reason`,
            where `<group>` is `include`, `exclude` or `rubric`. `rank_key` is
            `(int(passed), quality)` with `quality` the fraction of satisfied
            criteria (0.0 when there are none).
        """
        actual = extract_text(chat_result)
        must_include, must_not_include, rubric = self._criteria(item.expected_output)

        # (detail-key prefix, criteria, judge, invert verdict?, gates pass/fail?)
        groups = (
            ("include", must_include, self._positive_judge, False, True),
            # The violation judge reports PRESENCE of the forbidden trait, so
            # its verdict is inverted: True == trait absent == correctly avoided.
            ("exclude", must_not_include, self._violation_judge, True, True),
            ("rubric", rubric, self._positive_judge, False, False),
        )

        detail: dict[str, Any] = {"actual_output": actual}
        verdicts: list[bool] = []
        passed = True
        for prefix, criteria, judge, invert, gating in groups:
            for i, criterion in enumerate(criteria):
                verdict, reason = judge.score(item.question, criterion, actual)
                # Coerce to a real bool so a numeric 0/1 verdict cannot leak
                # into `passed` or be left out of the quality score.
                ok = bool(verdict) != invert
                detail[f"{prefix}_{i}"] = ok
                detail[f"{prefix}_{i}_reason"] = reason
                verdicts.append(ok)
                if gating and not ok:
                    passed = False

        quality = sum(verdicts) / len(verdicts) if verdicts else 0.0

        return ItemEvaluation(passed=passed, rank_key=(int(passed), quality), detail=detail)
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ def _event(event_type: str, body: dict[str, Any]) -> dict[str, Any]:
"id": trace_id,
"timestamp": now,
"name": f"gd-eval: {report.question[:80]}",
# Expose the model on a first-class trace field so Langfuse
# dashboards can filter / break down by it ("Version"); trace
# metadata is not available as a breakdown dimension.
"version": self._model_id or None,
"input": {"question": report.question},
"output": report.best_detail,
"metadata": {
Expand Down
19 changes: 19 additions & 0 deletions packages/gooddata-eval/src/gooddata_eval/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,23 @@ class ChatResult(BaseModel):
tool_call_events: list[ToolCallEvent] = Field(default_factory=list, alias="toolCallEvents")


class SummaryInput(BaseModel):
    """Structured input for the `dashboard_summary` test kind.

    Maps onto the dedicated summary endpoint's request body
    (`POST /api/v1/ai/workspaces/{ws}/summary`). Authored in snake_case in the
    dataset; the SummaryClient maps it to the endpoint's camelCase fields.

    Only `dashboard_id` is required; every other field defaults to None.
    """

    # Keys that are not declared below are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # Id of the dashboard to summarize (the only required field).
    dashboard_id: str
    # Optional list of visualization ids that narrows the summary's scope.
    visualizations: list[str] | None = None
    # Optional filters (the README calls them AFM filters); each entry is kept
    # as a raw dict and its inner shape is not validated by this model.
    filter_context: list[dict] | None = None
    # Optional scope narrowing by tab — presumably a dashboard tab id; confirm
    # against the summary endpoint's contract.
    tab_id: str | None = None
    # Optional hint forwarded with the request; accepted values are defined by
    # the endpoint, not by this model.
    format_hint: str | None = None


class DatasetItem(BaseModel):
"""Common dataset envelope. `expected_output` stays raw; each evaluator parses its own shape."""

Expand All @@ -95,3 +112,5 @@ class DatasetItem(BaseModel):
test_kind: str
question: str
expected_output: Any
# Only used by the `dashboard_summary` test kind; ignored by all others.
summary_input: SummaryInput | None = None
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,11 @@ def render_console(report: EvalReport, *, console: Console | None = None) -> str
elif item.pass_at_k:
result, notes = "PASS", ""
else:
d = item.best_detail
failing = [
k
for k in ("metrics_correct", "dimensions_correct", "filters_correct", "viz_type_hard")
if d.get(k) is False
]
notes = "failed: " + ", ".join(failing) if failing else "no visualization created"
# Evaluator-agnostic: report whichever boolean checks came back False
# (visualization uses metrics_correct/…; dashboard_summary uses
# include_*/exclude_*/rubric_*). Falls back to a generic message.
failing = [k for k, v in item.best_detail.items() if v is False]
notes = "failed: " + ", ".join(failing) if failing else "did not pass strict checks"
result = "FAIL"
latency = "-" if item.runs == 0 else f"{item.latency_s:.2f}s"
avg = "-" if item.runs == 0 else f"{item.avg_latency_s:.2f}s"
Expand Down
6 changes: 4 additions & 2 deletions packages/gooddata-eval/src/gooddata_eval/core/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@


class ChatBackend(Protocol):
def ask(self, question: str) -> ChatResult: ...
# Receives the whole item so backends can use per-item context beyond the
# question text (e.g. dashboard_summary needs item.summary_input).
def ask(self, item: DatasetItem) -> ChatResult: ...


@dataclass
Expand Down Expand Up @@ -109,7 +111,7 @@ def _run_one_item(
try:
for run_index in range(1, runs + 1):
t0 = time.perf_counter()
chat_result = backend.ask(item.question)
chat_result = backend.ask(item)
evaluation = evaluator.evaluate(item, chat_result)
latency = time.perf_counter() - t0
report.runs += 1
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# (C) 2026 GoodData Corporation
Loading
Loading