From 8b60cd65519e0ce0eae9981b70c59ad8aad504e7 Mon Sep 17 00:00:00 2001 From: xiaowuc1 Date: Tue, 2 Jun 2026 18:22:28 -0700 Subject: [PATCH] 0.1.2 --- .github/workflows/plugin-sanity.yml | 104 +++ plugins/docent/.codex-plugin/plugin.json | 2 +- plugins/docent/.mcp.json | 2 +- plugins/docent/skills/docent/SKILL.md | 2 +- plugins/docent/skills/docent/analysis.md | 8 +- plugins/docent/skills/docent/dql-reference.md | 3 +- .../skills/docent/ingestion-reference.md | 472 ++++++++++ plugins/docent/skills/docent/ingestion.md | 861 +++--------------- .../skills/docent/readings-reference.md | 6 +- plugins/docent/skills/docent/report.md | 11 +- 10 files changed, 715 insertions(+), 756 deletions(-) create mode 100644 .github/workflows/plugin-sanity.yml create mode 100644 plugins/docent/skills/docent/ingestion-reference.md diff --git a/.github/workflows/plugin-sanity.yml b/.github/workflows/plugin-sanity.yml new file mode 100644 index 0000000..855015c --- /dev/null +++ b/.github/workflows/plugin-sanity.yml @@ -0,0 +1,104 @@ +name: Plugin sanity + +on: + push: + pull_request: + workflow_dispatch: + +permissions: + contents: read + +jobs: + sanity: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Validate Codex plugin package + run: | + python - <<'PY' + import json + import re + from pathlib import Path + + root = Path.cwd() + + def fail(message: str) -> None: + raise SystemExit(message) + + def load_json(path: Path) -> dict: + try: + return json.loads(path.read_text(encoding="utf-8")) + except Exception as exc: + fail(f"{path} is not valid JSON: {exc}") + + marketplace = load_json(root / ".agents" / "plugins" / "marketplace.json") + entries = marketplace.get("plugins") + if not isinstance(entries, list): + fail("marketplace plugins must be a list") + + docent_entries = [entry for entry in entries if isinstance(entry, dict) and entry.get("name") == "docent"] + if len(docent_entries) != 1: + fail("marketplace must 
contain exactly one docent plugin entry") + + source = docent_entries[0].get("source") + if not isinstance(source, dict) or source.get("source") != "local": + fail("docent marketplace source must be local") + plugin_dir = root / source.get("path", "") + if not plugin_dir.is_dir(): + fail(f"marketplace source path does not exist: {plugin_dir}") + + manifest = load_json(plugin_dir / ".codex-plugin" / "plugin.json") + if manifest.get("name") != "docent": + fail("plugin manifest name must be docent") + + version = manifest.get("version") + if not isinstance(version, str) or not re.fullmatch(r"\d+\.\d+\.\d+", version): + fail("plugin manifest version must be plain major.minor.patch") + if manifest.get("skills") != "./skills/": + fail("plugin manifest skills must point to ./skills/") + if manifest.get("mcpServers") != "./.mcp.json": + fail("plugin manifest mcpServers must point to ./.mcp.json") + + required_files = [ + ".codex-plugin/plugin.json", + ".mcp.json", + "skills/docent/SKILL.md", + "skills/docent/analysis.md", + "skills/docent/dql-reference.md", + "skills/docent/ingestion-reference.md", + "skills/docent/ingestion.md", + "skills/docent/readings-reference.md", + "skills/docent/report.md", + ] + for rel_path in required_files: + path = plugin_dir / rel_path + if not path.is_file(): + fail(f"required plugin file is missing: {rel_path}") + if path.suffix == ".md" and not path.read_text(encoding="utf-8").strip(): + fail(f"markdown file is empty: {rel_path}") + + mcp = load_json(plugin_dir / ".mcp.json") + server = mcp.get("mcpServers", {}).get("docent") + if not isinstance(server, dict): + fail(".mcp.json must define mcpServers.docent") + if server.get("type") != "stdio" or server.get("command") != "uv": + fail("docent MCP server must run as uv stdio") + args = server.get("args") + if not isinstance(args, list) or "--from" not in args[:-1]:  # --from needs a following package argument + fail("docent MCP server args must include --from followed by a package spec") + package = args[args.index("--from") + 1] + if package != 
"docent-python>=0.1.73": + fail("docent MCP server must require docent-python>=0.1.73") + + forbidden_names = {".mcp.local.json", "docent.env"} + for path in plugin_dir.rglob("*"): + if path.name in forbidden_names or path.name.startswith("docent.env."): + fail(f"local credential/config file must not be published: {path}") + + print("Codex plugin sanity checks passed") + PY diff --git a/plugins/docent/.codex-plugin/plugin.json b/plugins/docent/.codex-plugin/plugin.json index e445c1b..e00f400 100644 --- a/plugins/docent/.codex-plugin/plugin.json +++ b/plugins/docent/.codex-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "docent", - "version": "0.1.1", + "version": "0.1.2", "description": "Docent AI analysis tools for Codex.", "author": { "name": "Transluce", diff --git a/plugins/docent/.mcp.json b/plugins/docent/.mcp.json index c239498..a3f959c 100644 --- a/plugins/docent/.mcp.json +++ b/plugins/docent/.mcp.json @@ -3,7 +3,7 @@ "docent": { "type": "stdio", "command": "uv", - "args": ["tool", "run", "--from", "docent-python", "docent-mcp"] + "args": ["tool", "run", "--from", "docent-python>=0.1.73", "docent-mcp"] } } } diff --git a/plugins/docent/skills/docent/SKILL.md b/plugins/docent/skills/docent/SKILL.md index 0ec53da..8134e2d 100644 --- a/plugins/docent/skills/docent/SKILL.md +++ b/plugins/docent/skills/docent/SKILL.md @@ -17,5 +17,5 @@ This is the root skill for all Docent work. 
This file is just a table of content - For the Readings API (`client.read`, `client.query`, batching, prompts, clustering): `./readings-reference.md` - For DQL syntax, schemas, quirks, and example queries: `./dql-reference.md` - For the reports API: `./report.md` (only if the user explicitly asks for a report) -- For ingestion-side data-model and conversion examples: the reference and pattern sections in `./ingestion.md` +- For ingestion-side data-model and conversion examples: `./ingestion-reference.md` - SDK reference is available by visiting [our online documentation](https://docs.transluce.org/llms.txt) diff --git a/plugins/docent/skills/docent/analysis.md b/plugins/docent/skills/docent/analysis.md index f8726c1..5f55aa2 100644 --- a/plugins/docent/skills/docent/analysis.md +++ b/plugins/docent/skills/docent/analysis.md @@ -64,11 +64,11 @@ client = Docent.from_url("https://docent.transluce.org/dashboard/668354d8-...") ``` This parses the domain and collection ID from the URL automatically. -The Docent SDK can be configured by a docent.env file in the working directory. The SDK will automatically discover and load a docent.env file if it exists. You do not need to explicitly source docent.env. Config files may use INI-style `[section]` headers for multi-profile support; select a profile with `Docent(profile="my-profile")` or the `DOCENT_PROFILE` environment variable. +The Docent SDK can be configured by a `docent.env` file. The SDK searches from the current working directory upward through parent directories, then falls back to `~/.docent/docent.env` if no local file exists. You do not need to explicitly source `docent.env`. Config files may use INI-style `[section]` headers for multi-profile support; select a profile with `Docent(profile="my-profile")` or the `DOCENT_PROFILE` environment variable. 
If you're not sure what collection the user is talking about: * If the user provides a Docent dashboard URL (e.g., `https://docent.transluce.org/dashboard/668354d8-...`), use `Docent.from_url()` or extract the collection ID from the last path segment (the UUID). -* Otherwise, check the `docent.env` file in the working directory for `DOCENT_COLLECTION_ID`. +* Otherwise, check the SDK-discovered `docent.env` file for `DOCENT_COLLECTION_ID`. * If neither is available, ask the user to paste the collection UUID. The main Docent deployment lives at https://docent.transluce.org but the user may connect a different deployment by overriding DOCENT_FRONTEND_URL in docent.env. The Docent SDK will print out the frontend URL when it is initialized, e.g. `Authenticating Docent client with frontend_url='https://docent.transluce.org'`. If you see a different frontend URL, use that URL in place of `https://docent.transluce.org` for any links. @@ -80,7 +80,7 @@ If you run into any issues or unexpected behavior with the Docent platform, paus * If authentication fails (HTTP 401) or no API key is configured, walk the user through setup: 1. Open the API keys page for them: `open https://docent.transluce.org/settings/api-keys` (macOS) or `xdg-open https://docent.transluce.org/settings/api-keys` (Linux). 2. Ask them to create a new API key (it will start with `dk_`). - 3. Write the key to a `docent.env` file in the working directory: `DOCENT_API_KEY=dk_...` (plus `DOCENT_API_URL` and `DOCENT_FRONTEND_URL` if not using the default instance). + 3. Write the key to a local `docent.env` file or `~/.docent/docent.env`: `DOCENT_API_KEY=dk_...` (plus `DOCENT_API_URL` and `DOCENT_FRONTEND_URL` if not using the default instance). 4. Verify connectivity by constructing a `Docent()` client — the constructor validates the API key automatically. * If the SDK does not match what's documented here, check whether the SDK is up to date. 
* If the Docent MCP server is available but doesn't match the tools documented here, check whether the MCP server needs an upgrade (`uv tool upgrade docent`). If an upgrade was needed, ask the user to restart the session or MCP server. @@ -322,7 +322,7 @@ ORDER BY cnt DESC These are specific rules that follow from the principles above. They apply throughout the analysis: -* **Never present opaque Python computation as analysis results.** Orientation queries (Step 1) are for *your* understanding and can use `execute_dql()` and local Python. But once you move past orientation into actual analysis (Step 3), findings must go through Docent's inspectable pipeline — DQL query steps visible in the UI and LLM analyses with citable evidence. If the user's question requires categorization, comparison, or synthesis, use Docent analyses, not a Python script that outputs a table. The user has no way to verify, inspect, or drill into results that come from opaque code. Metadata aggregations via DQL are acceptable as supporting context (e.g., counts, averages), but the analytical conclusions should come from inspectable analyses the user can review in the Docent UI. +* **Never present opaque Python computation as analysis results.** Orientation queries (Step 1) are for *your* understanding and can use `execute_dql()` and local Python. But once you move past orientation into actual analysis (Step 3), findings must go through Docent's inspectable pipeline — DQL query steps visible in the UI and analysis-plan readings with citable evidence. If the user's question requires categorization, comparison, or synthesis, use Docent analyses, not a Python script that outputs a table. The user has no way to verify, inspect, or drill into results that come from opaque code. Metadata aggregations via DQL are acceptable as supporting context (e.g., counts, averages), but the analytical conclusions should come from inspectable analyses the user can review in the Docent UI. 
* **Don't fall back to manual synthesis when an analysis step fails.** If a synthesis step fails (e.g., context overflow), fix the analysis design (batch it, sample it, use structured aggregation) and re-submit. Do not absorb the synthesis work into opaque Python scripts or agent-side summarization — this defeats the core value of Docent's inspectable, citable analysis. If you must do agent-side aggregation as a stopgap (e.g., counting structured output fields via a query), explicitly flag to the user that this step is not inspectable in the Docent UI and offer to re-run it properly. * If the user asks you to "read the agent runs", "summarize 10 transcripts", "classify the results", or similar, that not mean that you (the coding agent) should do so directly. Prefer to do this in an analysis plan using readings. * **Be transparent about reused work.** This has two parts: diff --git a/plugins/docent/skills/docent/dql-reference.md b/plugins/docent/skills/docent/dql-reference.md index a821fe4..1ce3444 100644 --- a/plugins/docent/skills/docent/dql-reference.md +++ b/plugins/docent/skills/docent/dql-reference.md @@ -44,7 +44,6 @@ raw_rows = client.dql_result_to_dicts(result) | `transcripts` | Individual transcripts tied to an agent run; stores serialized messages and per-transcript metadata. | | `transcript_groups` | Hierarchical groupings of transcripts for runs. | | `judge_results` | Scored rubric outputs keyed by agent run and rubric version. | -| `results` | Individual LLM analysis results from result sets. | | `readings` | Reading definitions (template or scripted LLM analysis). | | `reading_results` | Results from running readings. | | `reading_result_links` | Junction table linking readings to their results. | @@ -288,7 +287,7 @@ LIMIT 50; - **Single statement**: Batches or multiple statements are rejected. - **Explicit projection**: Wildcard projections (`*`) are disallowed. List the columns you need. 
- **Collection scoping**: A single query can only access data within a single collection. -- **Limit enforcement**: Every query is capped at 10,000 rows. Use pagination (`OFFSET`/`LIMIT`) for larger result sets. +- **Limit enforcement**: Every query is capped at 10,000 rows. Use pagination (`OFFSET`/`LIMIT`) for larger row collections. - **JSON performance**: Heavy JSON traversal across large collections can be slow. Prefer top-level fields when available. - **Type awareness**: Cast values explicitly when precision matters. diff --git a/plugins/docent/skills/docent/ingestion-reference.md b/plugins/docent/skills/docent/ingestion-reference.md new file mode 100644 index 0000000..a07ef6e --- /dev/null +++ b/plugins/docent/skills/docent/ingestion-reference.md @@ -0,0 +1,472 @@ +# Docent Ingestion Reference + +Load this file only when you need concrete code or detailed patterns while following `./ingestion.md`. + +## Source Discovery Helpers + +Use these snippets as starting points. Adapt them to the source layout instead of treating them as required framework code. 
+ +```python +from collections import Counter +from pathlib import Path + + +def build_folder_tree(path: str, max_depth: int = 5) -> dict | None: + path_obj = Path(path) + + def recurse(current: Path, depth: int) -> dict | None: + if depth > max_depth or not current.is_dir(): + return None + + children = {} + file_extensions = Counter() + + for item in sorted(current.iterdir()): + if item.is_dir(): + children[item.name] = recurse(item, depth + 1) + else: + file_extensions[item.suffix.lower() or "no_ext"] += 1 + + return { + "children": children, + "file_counts": dict(file_extensions), + "total_files": sum(file_extensions.values()), + } + + return recurse(path_obj, 0) + + +def find_repeatable_template(tree: dict) -> dict: + def signature(node: dict | None) -> tuple: + if node is None: + return () + child_names = tuple(sorted(node.get("children", {}).keys())) + file_exts = tuple(sorted(node.get("file_counts", {}).keys())) + return (child_names, file_exts) + + signatures = {} + + def collect(node: dict | None, path: str = "") -> None: + if node is None: + return + sig = signature(node) + signatures.setdefault(sig, []).append(path) + for name, child in node.get("children", {}).items(): + collect(child, f"{path}/{name}") + + collect(tree) + repeated = [(sig, paths) for sig, paths in signatures.items() if len(paths) > 1 and sig[0]] + if not repeated: + return {"template_structure": None, "note": "No repeating pattern found"} + + repeated.sort(key=lambda item: len(item[1]), reverse=True) + return { + "template_structure": repeated[0][0], + "instance_count": len(repeated[0][1]), + "example_paths": repeated[0][1][:3], + } + + +def detect_inspect_files(path: Path) -> list[str]: + return [str(file) for file in path.rglob("*.eval")] +``` + +```python +from pathlib import Path + + +def sample_files_strategically(path: Path, template_info: dict) -> list[Path]: + samples = [] + + for instance_path in template_info.get("example_paths", [])[:2]: + instance = path / 
instance_path.lstrip("/") + for subdir in ["trajs", "trajectories", "logs", "results", ""]: + candidate = instance / subdir if subdir else instance + if candidate.exists(): + samples.extend(list(candidate.glob("*.json"))[:1]) + samples.extend(list(candidate.glob("*.jsonl"))[:1]) + if samples: + break + + if not samples: + samples = list(path.rglob("*.json"))[:3] + list(path.rglob("*.jsonl"))[:2] + + return samples[:5] +``` + +```python +def infer_json_schema(data: dict | list, max_depth: int = 5) -> dict: + if max_depth == 0: + return {"type": "any", "note": "truncated"} + + if isinstance(data, dict): + return { + "type": "object", + "fields": { + key: infer_json_schema(value, max_depth - 1) + for key, value in data.items() + }, + } + + if isinstance(data, list): + if not data: + return {"type": "array", "items": "unknown"} + item_schemas = [infer_json_schema(item, max_depth - 1) for item in data[:3]] + return {"type": "array", "items": item_schemas[0], "sample_count": len(data)} + + return {"type": type(data).__name__, "example": repr(data)[:100]} +``` + +## Inspect AI Logs + +When `.eval` files are detected, prefer the built-in loader: + +```python +from inspect_ai.log import read_eval_log +from docent.loaders.load_inspect import load_inspect_log + +eval_log = read_eval_log("path/to/file.eval") +agent_runs = load_inspect_log(eval_log) +print(f"Loaded {len(agent_runs)} runs from Inspect log") +``` + +## Transcript Sanity Check Warnings + +`check_agent_runs`, `check_agent_run`, `check_transcript`, and `check_messages` +return warning-level `TranscriptCheck` objects. They do not reject data by +themselves, but ingestion scripts should treat them as conversion errors unless +the warning category is explicitly understood, documented in the ingestion plan, +and accepted by the user. 
+ +All possible warning codes from `docent.data_models.chat.checks`: + +| Code | When it appears | Fix or acceptance guidance | +| --- | --- | --- | +| `empty_message` | A message has no visible text, structured reasoning, assistant tool calls, or tool error. | Drop source noise, or preserve omitted source data in metadata if it is important. | +| `system_message_after_conversation_start` | A system message appears after a user, assistant, or tool turn. | Move setup text into the initial system prompt, or document that the source intentionally changes instructions mid-run. | +| `conversation_starts_with_tool_message` | The first non-system message is a tool response. | Check whether the assistant tool call was omitted or split into another transcript. | +| `consecutive_assistant_messages` | Two assistant messages are adjacent. | Usually merge adjacent assistant text, reasoning blocks, and tool calls into one assistant message unless the split is intentional. | +| `consecutive_user_messages` | Two user messages are adjacent. | Check whether they are separate conversations, or merge them if the source represents one user turn in fragments. | +| `assistant_tool_calls_interrupted` | A non-tool message appears before all previous assistant tool calls receive tool responses. | Place tool responses immediately after the assistant message that requested them, before the next user or assistant turn. | +| `missing_tool_response` | An assistant tool call never receives a matching tool response by the end of the transcript. | Add a tool message with the same `tool_call_id`, or document why the source lacks the response. | +| `reasoning_embedded_as_text` | Assistant text contains reasoning markers such as `<think>`, `<thinking>`, `reasoning:`, or `thinking:` but has no structured reasoning content block. | Move reasoning into `{"type": "reasoning", "reasoning": ...}` and keep user-visible answer text in `{"type": "text", "text": ...}`. 
| +| `tool_call_missing_id` | An assistant tool call has a blank `id`. | Populate a stable id so the corresponding tool message can refer to it via `tool_call_id`. | +| `tool_call_missing_function` | An assistant tool call has an id but a blank function name. | Populate the tool function name if it exists in the source data. | +| `duplicate_tool_call_id_in_assistant_message` | One assistant message contains the same tool call id more than once. | Use unique tool call ids within each assistant turn. | +| `duplicate_tool_call_id` | A tool call id was already emitted by an earlier assistant message in the same transcript. | Preserve source ids only when they are globally unique per transcript; otherwise generate stable unique ids during conversion. | +| `tool_response_missing_id` | A tool message has a blank `tool_call_id`. | Set `tool_call_id` to the id of the assistant tool call that produced the response. | +| `orphan_tool_response` | A tool message references a `tool_call_id` that no previous assistant tool call emitted. | Check whether the assistant tool call was omitted, assigned a different id, or split into another transcript. | +| `duplicate_tool_response` | Multiple tool messages respond to the same `tool_call_id`. | Keep one tool response per tool call unless the source intentionally streams partial tool outputs. | +| `tool_response_function_mismatch` | A tool message function name does not match the function name on the referenced assistant tool call. | Use the function name from the assistant tool call, or document a source-specific reason for the mismatch. | + +## Base Ingestion Script Shape + +Use this shape for custom data. Fill in `load_data` and `convert_to_agent_run` based on the confirmed plan. 
+ +```python +import os +from pathlib import Path +from typing import Any + +from docent import Docent +from docent.data_models import AgentRun, Transcript +from docent.data_models.chat import ( + check_agent_runs, + format_check_report, + parse_chat_message, +) + + +DATA_PATH = Path("path/to/data") +COLLECTION_NAME = "collection-name" +DOCENT_API_KEY = os.environ.get("DOCENT_API_KEY")  # None lets the SDK fall back to a discovered docent.env + + +def load_data(path: Path) -> list[dict[str, Any]]: + records: list[dict[str, Any]] = [] + # Implement according to the confirmed source structure. + return records + + +def convert_to_agent_run(record: dict[str, Any]) -> AgentRun: + raw_messages = record.get("messages") or record.get("traj") or [] + messages = [parse_chat_message(message) for message in raw_messages] + + transcript = Transcript( + messages=messages, + metadata={}, # transcript-level fields from the mapping + ) + + return AgentRun( + transcripts=[transcript], + metadata={ + # scores, identifiers, grouping fields, and other mapped metadata + }, + ) + + +raw_data = load_data(DATA_PATH) +print(f"Loaded {len(raw_data)} source records") + +sample_errors = [] +for index, record in enumerate(raw_data[:10]): + try: + convert_to_agent_run(record) + except Exception as exc: + sample_errors.append({"index": index, "error": str(exc)}) + +if sample_errors: + raise RuntimeError(f"Sample conversion failed: {sample_errors[:5]}") + +agent_runs = [] +conversion_errors = [] +for index, record in enumerate(raw_data): + try: + agent_runs.append(convert_to_agent_run(record)) + except Exception as exc: + conversion_errors.append({"index": index, "error": str(exc)}) + +print(f"Converted {len(agent_runs)}/{len(raw_data)} source records") +if conversion_errors: + raise RuntimeError( + "Full conversion had failures. Fix or explicitly document every skipped " + f"source record before upload. 
Examples: {conversion_errors[:5]}" + ) + +sanity_report = check_agent_runs(agent_runs) +print(format_check_report(sanity_report)) +if sanity_report.has_warnings: + raise RuntimeError( + "AgentRun sanity checks produced warnings. Fix conversion problems, or " + "document accepted warning categories in ingestion-plan.md and confirm " + "with the user before upload." + ) + +client = Docent(api_key=DOCENT_API_KEY) +collection_id = client.create_collection(name=COLLECTION_NAME, description="") +upload_result = client.add_agent_runs(collection_id, agent_runs) +print(upload_result) +print(f"https://docent.transluce.org/dashboard/{collection_id}") +``` + +## Message Parsing + +Prefer `parse_chat_message` for dictionaries: + +```python +from docent.data_models.chat import parse_chat_message + +user_msg = parse_chat_message({"role": "user", "content": "What is 2+2?"}) +assistant_msg = parse_chat_message({"role": "assistant", "content": "The answer is 4."}) +system_msg = parse_chat_message({"role": "system", "content": "You are helpful."}) +``` + +Direct construction is also available when you need precise control: + +```python +from docent.data_models.chat import AssistantMessage, SystemMessage, UserMessage + +user_msg = UserMessage(content="Hello") +assistant_msg = AssistantMessage(content="Hi", model="gpt-4") +system_msg = SystemMessage(content="You are helpful.") +``` + +## Reasoning Handling + +Pay attention to reasoning during source analysis and sample conversion. +Deterministic sanity checks catch obvious structural issues such as adjacent +assistant messages and embedded reasoning markers, but they cannot decide +whether a source's reasoning stream was represented correctly. + +- Use `ContentReasoning` for visible reasoning summaries when the source exposes + them, and place those blocks on the same `AssistantMessage` as the answer text + and tool calls they belong to. 
+- If the source splits reasoning into separate assistant fragments, merge those + fragments into the following assistant message unless the split is semantically + intentional. +- Do not dump opaque or encrypted reasoning into user-visible text. Omit it or + preserve source-level counts/metadata, then document the omission in + `ingestion-plan.md`. +- During the sample conversion pass, inspect reasoning and tool-call turns + manually and record any accepted omissions or source-specific handling. + +```python +from docent.data_models.chat import AssistantMessage, ContentReasoning, ContentText + +assistant_msg = AssistantMessage( + content=[ + ContentReasoning(reasoning="The model's visible reasoning summary."), + ContentText(text="The answer shown to the user."), + ], +) +``` + +## Tool Calls + +Normalize raw tool calls before parsing messages if the source format differs from Docent's expected shape. + +```python +from docent.data_models.chat import AssistantMessage, ToolCall, ToolMessage + +assistant_msg = AssistantMessage( + content="Let me search for that.", + tool_calls=[ + ToolCall( + id="call_123", + function="web_search", + arguments={"query": "weather today"}, + type="function", + ) + ], +) + +tool_msg = ToolMessage( + content="Sunny, 72F", + tool_call_id="call_123", + function="web_search", +) +``` + +```python +from typing import Any + +from docent.data_models.chat import ToolCall + + +def parse_tool_calls(raw_calls: list[dict[str, Any]]) -> list[ToolCall]: + calls = [] + for index, raw_call in enumerate(raw_calls): + function_payload = raw_call.get("function") or {}  # tolerate a missing or null "function" + calls.append( + ToolCall( + id=raw_call.get("id") or f"call_{index}",  # blank ids would trip tool_call_missing_id + function=function_payload.get("name", raw_call.get("name", "")), + arguments=function_payload.get( + "arguments", + raw_call.get("arguments", {}), + ), + type="function", + ) + ) + return calls +``` + +## Simple Flat Records + +```python +from typing import Any + +from docent.data_models import AgentRun, Transcript +from 
docent.data_models.chat import parse_chat_message + + +def convert_simple(record: dict[str, Any]) -> AgentRun: + messages = [parse_chat_message(message) for message in record["messages"]] + metadata = {key: value for key, value in record.items() if key != "messages"} + metadata["scores"] = {"reward": record.get("reward", 0)} + + return AgentRun( + transcripts=[Transcript(messages=messages)], + metadata=metadata, + ) +``` + +## Pass@k Evaluation + +Use `TranscriptGroup` for attempts that belong to the same task-level `AgentRun`. + +```python +from typing import Any + +from docent.data_models import AgentRun, Transcript, TranscriptGroup +from docent.data_models.chat import parse_chat_message + + +def convert_pass_at_k(task_data: dict[str, Any]) -> AgentRun: + agent_run = AgentRun( + transcripts=[Transcript(messages=[])], + metadata={"task_id": task_data["task_id"]}, + ) + + groups = [] + transcripts = [] + + for index, attempt in enumerate(task_data["attempts"]): + group = TranscriptGroup( + name=f"Attempt {index + 1}", + agent_run_id=agent_run.id, + metadata={"k": index}, + ) + groups.append(group) + + transcript = Transcript( + messages=[parse_chat_message(message) for message in attempt["messages"]], + transcript_group_id=group.id, + metadata={"attempt": index}, + ) + transcripts.append(transcript) + + agent_run.transcripts = transcripts + agent_run.transcript_groups = groups + return agent_run +``` + +## Tree Or Branching Data + +Usually ingest each branch as its own `AgentRun`. Preserve tree structure in metadata. + +```python +from docent.data_models import AgentRun + +agent_run = AgentRun( + transcripts=[transcript], + metadata={ + "root_task_id": "task_123", + "branch_id": "branch_a_1", + "parent_branch_id": "branch_a", + "branch_depth": 2, + }, +) +``` + +## Multi-Agent Data + +Use one `Transcript` per agent in the same `AgentRun` when the agents share one episode-level outcome. 
+ +```python +from docent.data_models import AgentRun, Transcript + +agent_run = AgentRun( + transcripts=[ + Transcript(messages=agent_1_messages, metadata={"agent_id": "agent_1"}), + Transcript(messages=agent_2_messages, metadata={"agent_id": "agent_2"}), + ], + metadata={ + "episode_id": "episode_42", + "scores": {"joint_reward": 0.85}, + }, +) +``` + +## Verification Snippet + +Prefer an SDK or API count when available. If count keys differ across SDK versions, log the raw collection details and manually verify the collection page. + +```python +collection_info = client.get_collection(collection_id) +print(collection_info) + +uploaded_count = None +if collection_info: + for key in ["agent_run_count", "num_agent_runs", "n_agent_runs", "total_runs"]: + if key in collection_info: + uploaded_count = collection_info[key] + break + +print("VERIFICATION REPORT") +print(f"Source records: {len(raw_data)}") +print(f"Converted: {len(agent_runs)}") +print(f"Failed conversions: {len(conversion_errors)}") +print(f"Uploaded count: {uploaded_count if uploaded_count is not None else 'unknown'}") +print(f"Collection URL: https://docent.transluce.org/dashboard/{collection_id}") +``` diff --git a/plugins/docent/skills/docent/ingestion.md b/plugins/docent/skills/docent/ingestion.md index 4209e30..d4da82b 100644 --- a/plugins/docent/skills/docent/ingestion.md +++ b/plugins/docent/skills/docent/ingestion.md @@ -3,800 +3,191 @@ name: ingestion description: Structured workflow for ingesting agent run data into Docent. Use when the user wants to upload evaluation logs or agent transcripts to Docent. Triggers on phrases like "ingest into Docent", "upload to Docent", "import runs to Docent", or when working with agent evaluation data that needs to be loaded into Docent for analysis. 
--- -# **Docent Ingestion Skill** +# Docent Ingestion Skill -This skill provides a structured workflow for converting transcripts and evaluation logs into the correct format for ingestion to Docent, an agent analysis tool. +Use this workflow to convert local transcripts, agent logs, or evaluation traces into Docent `AgentRun` data and upload them to a Docent collection. -## **Overview of Docent** +Keep the main workflow lightweight. Load `./ingestion-reference.md` only when you need concrete SDK examples, conversion snippets, source-inspection helpers, or examples for Inspect AI, tool calls, pass@k, branching, or multi-agent data. -Docent is a trace analysis tool that helps researchers analyze and debug agents. Researchers upload a “collection” of traces (“agent runs”) into Docent, where the tool enables them to: +## Core Rules -* Engage in structured data analysis such as grouping and joining to understand trends and create charts -* Quickly view traces of interest and capture human annotations of traces through labeling and comments -* Run a semantic search over transcripts by running a user-provided query of each transcript in their collection, and the cluster the results to understand high-level patterns -* Draft, refine, and iterate with the user on detailed rubrics to capture fuzzy behaviors like sycophancy, cheating, verbosity, etc. +- Work in four stages: context, planning, ingestion, verification. +- Create and maintain `ingestion-plan.md` in the working directory. +- Do not upload until the user confirms the proposed collection name, Docent hierarchy, field mappings, and omitted data. +- Never silently skip source data. Any file or field not ingested must be documented with a reason and expected impact. +- Save ingestion code to a file such as `ingest.py` or `ingest_<source>.py`; do not rely on one-off inline Python for the final upload path. 
+- Use `parse_chat_message` from the Docent SDK for transcript messages, and make deliberate role mappings when the source roles differ from Docent's supported roles. +- Run deterministic `AgentRun` sanity checks before upload and resolve obvious conversion problems. -Docent accelerates researchers by helping them form hypotheses and directing them to read the most relevant transcripts. Researchers use Docent to qualitatively explain and understand shifts in quantitative metrics. Common use cases for Docent include: +## When Triggered -* Comparing between two checkpoints to understand a regression or to understand a quantitative tradeoff in their benchmark results -* Understanding an unexpected result. For instance, investigating why a checkpoint that receives high reward from a preference model (e.g. for code quality) appears to perform poorly with real users (e.g. PRs frequently rejected for low quality) -* Surfacing previously unknown failure modes. For instance, noticing that the timeout constraint is not explicit in an evaluation, causing thinking models to perform poorly compared to their non-thinking counterparts +If the user asks to "ingest", "upload", "import", or "move" traces, transcripts, or eval logs into Docent, briefly offer this structured workflow: -## **When to Offer This Workflow** +1. Gather context and credentials. +2. Inspect the data and propose a Docent organization. +3. Write and run an ingestion script. +4. Verify uploaded counts and warnings. -**Trigger conditions**: -The user mentions phrases like "ingest transcripts into Docent", "upload to Docent", "import runs to Docent,” “move data into Docent,” “upload traces to Docent” +If the user accepts or directly asks you to proceed, start Stage 1. If they decline, work freeform. -### **Initial offer:** +## Stage 1: Context -Offer the user a structured workflow for ingesting their transcripts into Docent. 
 Briefly explain the four stages: +Before running any Python, use an existing virtual environment if present. If no environment is active and `docent-python` is unavailable, ask before installing it. -1. **Context gathering**: User provides relevant context on their data, including the path to the data, how it was produced, and what kinds of analysis they would like to do -2. **Planning**: Understand how the user’s data is organized, plan an ingestion strategy, and recommend a suggested organization in Docent to the user. Surface the plan for user approval. - 1. Examine the overall data hierarchy by mapping the directory and file structure to understand if there are recurring patterns. - 2. For individual transcripts, identify all unique formats and create a template for ingesting each one. Map out a schema of each unique transcript format and map each field to the most appropriate class in Docent. - 3. Propose an organization structure in Docent (broken down into collections, agent runs, transcript groups, and transcripts) that fits the user’s analysis needs. -3. **Ingestion:** Given the suggested organization, plan how, write, and test a script that uploads all data from the user-provided directory to Docent. -4. **Testing**: After uploading to a collection in Docent, use the Docent SDK to pull down the collection data and verify that the metadata, transcript formats, and overall organization match expectations. +Collect only what is needed to plan: -Explain that you will ingest all the data provided in a directory of the user’s choosing. Explain the ask for context on the user’s analysis: while explaining is optional, it helps structure the data in Docent. Ask if they want to try this workflow or proceed freeform. +- API key: prefer `$DOCENT_API_KEY` or an SDK-discovered `docent.env` (current directory upward, then `~/.docent/docent.env`); ask only if neither is available. +- Data path: the file or directory to ingest. 
+- Optional context: what produced the data and what analysis the user wants to do in Docent. -If the user declines, work freeform. If the user accepts, proceed to Stage 1\. +Create `ingestion-plan.md` with this compact structure and append findings as the workflow proceeds: -## **Stage 1: Context Gathering** - -### **Environment Setup** - -Before running any Python commands, check for and activate a virtual environment: - -```shell -if [ -d "venv" ]; then - source venv/bin/activate -elif [ -d ".venv" ]; then - source .venv/bin/activate -fi -``` - -If there is no virtual environment present, prompt the user if they want to activate one -and proceed accordingly. - -Ensure the Docent SDK is installed. The package name is `docent-python`: - -```shell -pip install docent-python -``` - -### **Gathering Information** - -Collect only the essential information needed to start planning: - -- **API Key:** Check if `$DOCENT_API_KEY` is set in the environment or in a `docent.env` file. If not, ask: What is your Docent API key? (You can find or create one at: [https://docent.transluce.org/settings/api-keys](https://docent.transluce.org/settings/api-keys)) -- **Data Path:** What is the path to the files or directory you want to ingest? - -Once you have the data path, proceed to Stage 2 to analyze the data and create an ingestion plan. You will ask the user to confirm all details (including collection name, data context, and analysis goals) after presenting the plan. - -Create `ingestion-plan.md` in the working directory to log all decisions and findings throughout the workflow. 
Here is an example structure: - -``` +```markdown # Docent Ingestion Plan ## Configuration -- Data path: [from user] +- Data path: +- API key source: -## File Analysis -[to be filled in Stage 2a] +## Source Analysis +- File structure: +- Detected formats: +- Expected source record count: -## Schema -[to be filled in Stage 2b] +## Docent Model Orientation +- Documentation reviewed: +- Important SDK/model assumptions: -## Data Structure Proposal -[to be filled in Stage 2c] +## Proposed Docent Structure +- Collection: +- AgentRun unit: +- TranscriptGroup usage: +- Transcript usage: ## Field Mapping -[to be filled in Stage 2c] +| Source | Docent target | Notes | +| --- | --- | --- | ## Omitted Data -[MUST document any data not ingested and why] +| Field/File | Reason | Impact | +| --- | --- | --- | -## Plan Confirmation -- Collection name: [proposed, confirmed by user] -- Data context: [your understanding, confirmed by user] -- Analysis goals: [from user] +## Confirmation +- Collection name: +- Data context: +- Analysis goals: +- User confirmed: ## Execution Log -[to be filled in Stage 3] ## Verification -[to be filled in Stage 4 - compare expected vs actual counts] +- Source records: +- Converted: +- Failed conversions: +- Uploaded: +- Sanity warnings: +- Collection URL: ``` ---- - -## **Stage 2: Planning** - -### **Stage 2a: Understanding File Structure** +## Stage 2: Planning -Build understanding of the data organization to understand holistically how the user is storing their data and why they chose to organize it that way. Consider how this reflects on how they want their data stored in Docent. You can quickly get a sense of the data by using the appropriate strategies below. +### Orient on Docent Models -#### Build Structural Tree +Before designing the ingestion shape, review the ingestion-side SDK models and docs: -You can generate a folder-only tree with the following script, to see the overall directory structure. 
You may want to strategically list individual files in a few folders to understand them as well. +- Online SDK documentation: https://docs.transluce.org/llms.txt +- Local examples and snippets, as needed: `./ingestion-reference.md` -```py -import os -from pathlib import Path -from collections import Counter +At minimum, understand: -def build_folder_tree(path: str, max_depth: int = 5) -> dict: - """Build a tree of folder structure, detecting patterns.""" - path = Path(path) +- `Collection`, `AgentRun`, `TranscriptGroup`, and `Transcript` +- Message classes, `parse_chat_message`, supported roles, tool calls, and tool responses +- How the source represents reasoning, such as visible reasoning text, structured + summaries, opaque blobs, or split assistant fragments +- Where structured values belong: usually `AgentRun.metadata`, `Transcript.metadata`, scores, identifiers, and grouping fields +- ID behavior: the SDK assigns `AgentRun` IDs automatically - def _recurse(p: Path, depth: int) -> dict: - if depth > max_depth or not p.is_dir(): - return None +### Analyze Source Data - children = {} - file_extensions = Counter() +Inspect the data path enough to identify the repeatable unit that should become an `AgentRun`. 
- for item in sorted(p.iterdir()): - if item.is_dir(): - children[item.name] = _recurse(item, depth + 1) - else: - file_extensions[item.suffix.lower() or "no_ext"] += 1 +Look for: - return { - "children": children, - "file_counts": dict(file_extensions), - "total_files": sum(file_extensions.values()), - } +- Directory organization: experiment, model, checkpoint, date, task, sample, attempt, phase +- File formats: JSON, JSONL, Inspect `.eval`, logs, configs, metadata files +- Repeated templates: the same set of files or folders repeated across samples or experiments +- Transcript fields: `messages`, `conversation`, `dialogue`, `turns`, `traj`, `trajectory` +- Score and result fields: `score`, `reward`, `accuracy`, `correct`, `success`, `metric`, `result` +- Identifiers and grouping keys: `task_id`, `sample_id`, `episode`, `run_id`, `uuid` +- Special structures: pass@k attempts, tree/branching traces, multi-agent episodes, tool call sequences - return _recurse(path, 0) -``` +If Inspect `.eval` files are present, prefer the built-in Inspect loader. For mixed or unclear data, summarize your best interpretation and ask the user to confirm before coding. -Understanding what individual files are in a few folders may also be useful. List files in key directories to understand the naming conventions and file types present. +### Propose Docent Structure -#### Detect Naming Patterns +Most Docent analysis features, including rubrics, search, and clustering, operate at the `AgentRun` level. Structure data so each `AgentRun` is a meaningful analysis unit. -Examine folder and file names to understand the organizational logic. Sample a few names at different levels of the hierarchy and reason about what they might represent. 
+| Level | Use | +| --- | --- | +| `Collection` | One experiment, benchmark run, dataset, or cohesive ingestion batch | +| `AgentRun` | The primary item to analyze, compare, search, label, or score | +| `TranscriptGroup` | Attempts or phases within one `AgentRun`, such as pass@k | +| `Transcript` | One conversation history; use multiple transcripts for multi-agent runs | -Common patterns to look for (as suggestions, not strict rules): +Default: if unsure, make each independent task, episode, sample, or branch its own `AgentRun` with one `Transcript`. -- **Dates:** ISO format (2024-01-15), compact (20240115), or human-readable (jan\_15) -- **Model identifiers:** Model names, versions, or checkpoints -- **Sequential numbering:** run\_001, sample\_42, task\_5, episode\_100 -- **Experiment tags:** baseline, ablation, v2, control, treatment -- **Subdirectory conventions:** trajs/, logs/, results/, metadata/, configs/ +For tree or branching data, usually ingest each branch as its own `AgentRun` and use metadata such as `root_task_id`, `branch_id`, `parent_branch_id`, and `branch_depth` to preserve relationships. -Rather than pattern-matching, describe what you observe and hypothesize about the user's organizational intent. For example: +### Confirmation Gate -- "Folders appear to be organized by date, then by model name" -- "Each subfolder contains a `trajs/` directory with JSON files and a `config.yaml`" -- "File names include what looks like a task ID followed by an attempt number" +Before writing the final upload script, present the plan and wait for user confirmation. Include: -When listing out messages, you must use `parse_chat_message` from the Docent SDK. This means that you must -make an informed decision on each role that is provided and map it to one of the supported roles in Docent, -since the data provided might not use the same roles. 
+- Source structure and detected data type +- Proposed collection name +- Proposed `Collection` / `AgentRun` / `TranscriptGroup` / `Transcript` structure +- Key field mappings for messages, scores, identifiers, and metadata +- Any omitted files or fields, with reason and impact +- Expected source record count, if available +- Your understanding of the data context and analysis goals -Ask the user to confirm your interpretation if uncertain. +## Stage 3: Ingestion -#### Identify Repeatable Templates +For Inspect `.eval` files, use the built-in loader and proceed directly to sanity checks. See `./ingestion-reference.md` for the import pattern. -Find the structural unit that repeats across the directory (e.g., each experiment folder has the same subdirectory structure): +For custom data: -```py -def find_repeatable_template(tree: dict) -> dict: - """Find the pattern that repeats across the directory structure.""" +1. Write an ingestion script to the filesystem. +2. Load raw source records according to the confirmed file structure. +3. Convert a small sample into `AgentRun` objects. +4. Manually inspect sample turns with reasoning and tool calls to verify reasoning + was represented, merged, or intentionally omitted according to the plan. +5. Fix sample conversion issues. +6. Convert the full dataset and record conversion failures. +7. Run `check_agent_runs(agent_runs)` and inspect the formatted report. +8. Upload only after the conversion output and warnings match the confirmed plan. - def get_structure_signature(node: dict) -> tuple: - if node is None: - return () - children = node.get("children", {}) - child_names = tuple(sorted(children.keys())) - file_exts = tuple(sorted(node.get("file_counts", {}).keys())) - return (child_names, file_exts) +If a failure is not easily recoverable, such as unexpected data shape, authentication failure, API error, or ambiguous SDK error, stop and ask the user how they want to proceed. 
Include the exact error and the affected file or record when possible. - signatures = {} - def collect_signatures(node: dict, path: str = ""): - if node is None: - return - sig = get_structure_signature(node) - if sig not in signatures: - signatures[sig] = [] - signatures[sig].append(path) - for name, child in node.get("children", {}).items(): - collect_signatures(child, f"{path}/{name}") +### Sanity Checks - collect_signatures(tree) +`check_agent_runs` warnings are not necessarily schema errors, but they often reveal conversion mistakes. Fix warnings caused by data shaping. For warnings that may be legitimate, summarize categories, counts, and representative examples, then ask whether they are expected. - repeated = [(sig, paths) for sig, paths in signatures.items() - if len(paths) > 1 and sig[0]] - - if repeated: - repeated.sort(key=lambda x: len(x[1]), reverse=True) - return { - "template_structure": repeated[0][0], - "instance_count": len(repeated[0][1]), - "example_paths": repeated[0][1][:3], - } - return {"template_structure": None, "note": "No repeating pattern found"} -``` +Deterministic checks do not fully validate reasoning handling. Inspect source +reasoning during sample conversion, especially when the source stores reasoning +outside normal assistant text or splits reasoning from the answer/tool-call turn. -#### Detect Inspect AI Files - -Check for Inspect AI `.eval` files, which have a dedicated loader: - -```py -def detect_inspect_files(path: Path) -> list[str]: - """Detect Inspect .eval files that can use the built-in loader.""" - return [str(f) for f in path.rglob("*.eval")] -``` - -If Inspect `.eval` files are detected, use the built-in loader (see Stage 3). 
- -#### Decision Point - -Based on the structural analysis, determine next steps: - -| Structure Pattern | Action | -| :---- | :---- | -| Clear repeating template with trajs/logs subdirs | Proceed to schema inference on representative samples | -| Flat directory with consistent file types | Sample files directly for schema | -| Mixed/unclear structure | Ask user for clarification | -| Inspect .eval files present | Use built-in Inspect loader | -| No recognizable data files | Ask user to confirm path | - -Log the structural analysis to `ingestion-plan.md`. - ---- - -### **Stage 2b: Schema Inference** - -Sample files strategically based on the template structure identified in Stage 2a. - -#### Strategic Sampling - -```py -def sample_files_strategically(path: Path, template_info: dict) -> list[Path]: - """Sample files from representative locations within the template structure.""" - samples = [] - - if template_info.get("example_paths"): - for instance_path in template_info["example_paths"][:2]: - instance = path / instance_path.lstrip("/") - for subdir in ["trajs", "trajectories", "logs", "results", ""]: - candidate = instance / subdir if subdir else instance - if candidate.exists(): - json_files = list(candidate.glob("*.json"))[:1] - jsonl_files = list(candidate.glob("*.jsonl"))[:1] - samples.extend(json_files + jsonl_files) - if samples: - break - - if not samples: - samples = list(path.rglob("*.json"))[:3] + list(path.rglob("*.jsonl"))[:2] - - return samples[:5] -``` - -#### Infer Schema - -```py -def infer_json_schema(data: dict | list, max_depth: int = 5) -> dict: - """Recursively infer schema from JSON data.""" - if max_depth == 0: - return {"type": "any", "note": "truncated"} - - if isinstance(data, dict): - return { - "type": "object", - "fields": { - k: infer_json_schema(v, max_depth - 1) - for k, v in data.items() - } - } - elif isinstance(data, list): - if not data: - return {"type": "array", "items": "unknown"} - item_schemas = [infer_json_schema(item, 
max_depth - 1) for item in data[:3]] - return {"type": "array", "items": item_schemas[0], "sample_count": len(data)} - else: - return {"type": type(data).__name__, "example": repr(data)[:100]} -``` - -#### Classify Fields - -Identify fields that indicate transcript content, scores, and metadata: - -```py -TRANSCRIPT_INDICATORS = ["messages", "conversation", "transcript", "dialogue", "turns", "traj", "trajectory"] -SCORE_INDICATORS = ["score", "reward", "accuracy", "correct", "success", "metric", "result"] -ID_INDICATORS = ["id", "task_id", "sample_id", "episode", "run_id", "uuid"] - -def classify_fields(schema: dict) -> dict: - """Classify fields by their likely purpose.""" - classified = {"transcript": [], "scores": [], "identifiers": [], "metadata": []} - - def check_field(name: str, field_schema: dict, path: str = ""): - full_path = f"{path}.{name}" if path else name - name_lower = name.lower() - - if any(ind in name_lower for ind in TRANSCRIPT_INDICATORS): - classified["transcript"].append(full_path) - elif any(ind in name_lower for ind in SCORE_INDICATORS): - classified["scores"].append(full_path) - elif any(ind in name_lower for ind in ID_INDICATORS): - classified["identifiers"].append(full_path) - else: - classified["metadata"].append(full_path) - - if field_schema.get("type") == "object": - for sub_name, sub_schema in field_schema.get("fields", {}).items(): - check_field(sub_name, sub_schema, full_path) - - for name, field_schema in schema.get("fields", {}).items(): - check_field(name, field_schema) - - return classified -``` - -Log schema and field classification to `ingestion-plan.md`. - ---- - -### **Stage 2c: Docent Organization Proposal** - -Propose how to organize the data in Docent based on the user's analysis goals and data structure. - -#### Docent Hierarchy Best Practices - -**Critical:** Most Docent analysis features (rubrics, search, clustering) operate at the **AgentRun level**. 
Structure data accordingly: - -| Level | Purpose | When to Use | -| :---- | :---- | :---- | -| **Collection** | One experiment, benchmark run, or dataset | Usually one per ingestion; multiple if fundamentally different experiments | -| **AgentRun** | Primary analysis unit | One per complete unit you want to analyze, compare, or score. Rubrics run here. Search returns these. | -| **TranscriptGroup** | Logical groupings within an AgentRun | Multiple attempts (pass@k), phases of a task | -| **Transcript** | One agent's conversation history | One per agent in multi-agent setups; otherwise usually one per AgentRun | - -**Default:** If unsure, make each independent task/episode/sample its own AgentRun with a single Transcript. - -**Tree/branching data:** Ingest each branch as its own Transcript in its own AgentRun. Use metadata fields to identify how branches relate to each other (e.g., `parent_branch_id`, `branch_depth`, `root_task_id`). - -#### Data Pattern to Docent Mapping - -| Data Pattern | Collection | AgentRun | TranscriptGroup | Transcript | -| :---- | :---- | :---- | :---- | :---- | -| Simple evals | experiment | sample\_id, scores | — | messages | -| Pass@k | experiment | task\_id, best\_score | attempt\_k | messages per attempt | -| Tree/branching | experiment | one per branch, with metadata linking branches | — | messages for that branch | -| Multi-agent | experiment | episode\_id, joint\_scores | — | one per agent | - -#### Field Mapping - -Map each source field to a Docent location: - -| Source Field | Target Location | Target Field | Notes | -| :---- | :---- | :---- | :---- | -| messages | Transcript.messages | — | Convert via parse\_chat\_message | -| reward | AgentRun.metadata | scores.reward | | -| task\_id | AgentRun.metadata | task\_id | | - -#### Document Omitted Data - -**CRITICAL:** If ANY data will not be ingested, document it clearly: - -| Field/File | Reason for Omission | Impact | -| :---- | :---- | :---- | -| `debug_logs/` | Contains only 
debug output, not agent transcripts | None | -| `raw_api_responses` | Redundant with parsed messages | Low | - -**Never silently skip data.** - -#### Present Plan for Review - -Present the complete ingestion plan to the user and ask them to confirm all details: - -1. **Directory structure discovered** - what files/folders were found -2. **Data type detected** - what format the data appears to be in -3. **Proposed Docent hierarchy** - how data will be organized into collections, agent runs, transcript groups, and transcripts -4. **Key field mappings** - which fields map to scores, metadata, messages, etc. -5. **Omitted data** (if any) - what data will not be ingested and why -6. **Collection name** - propose a name based on the data, ask user to confirm or provide a different name -7. **Data context** - summarize your understanding of what this data represents (e.g., benchmark evaluation, agent task runs, multi-agent debate). Ask the user to confirm or clarify. -8. **Analysis goals** - ask what kinds of analysis they want to do in Docent (e.g., compare two model checkpoints, find failure modes, understand a metric regression). This helps ensure the data is structured appropriately. - -Wait for the user to confirm all details before proceeding to Stage 3. - ---- - -## **Stage 3: Ingestion** - -### **Environment Setup** - -Activate virtual environment if present: - -```shell -if [ -d "venv" ]; then - source venv/bin/activate -elif [ -d ".venv" ]; then - source .venv/bin/activate -fi -``` +Document any accepted warnings in `ingestion-plan.md` with counts and justification. 
-### **Handle Inspect AI Files** +## Stage 4: Verification -If Inspect `.eval` files were detected, use the built-in loader: +After upload, verify and log: -```py -from inspect_ai.log import read_eval_log -from docent.loaders.load_inspect import load_inspect_log +- Source records discovered +- Records converted successfully +- Conversion failures and representative errors +- Agent runs uploaded to Docent +- Whether source, converted, and uploaded counts match expectations +- Any accepted sanity warnings +- Collection URL -eval_log = read_eval_log("path/to/file.eval") -agent_runs = load_inspect_log(eval_log) -print(f"Loaded {len(agent_runs)} runs from Inspect log") -``` - -Skip to "Upload to Docent" below. - -### **Custom Data Loading** - -For non-Inspect data, build the ingestion script incrementally. - -**Important:** Always save ingestion scripts to the filesystem (e.g., `ingest.py` or `ingest_.py`) rather than running them inline. This aids in debugging, allows for iterative refinement, and provides a record of exactly how the data was ingested. - -**Error handling:** When running the ingestion script, if you encounter a failure that does not look easily recoverable (e.g., unexpected data format, authentication errors, API errors, or unclear error messages), stop and prompt the user for guidance rather than attempting repeated fixes. Describe the error clearly and ask how they would like to proceed. 
- -#### Load Data - -```py -import os -import json -from pathlib import Path -from docent import Docent -from docent.data_models import AgentRun, Transcript, TranscriptGroup -from docent.data_models.chat import parse_chat_message, ToolCall - -def load_data(path: str) -> list[dict]: - """Load data based on structure identified in Stage 2a.""" - path = Path(path) - records = [] - # Implementation based on detected template structure - return records - -raw_data = load_data(data_path) -print(f"Loaded {len(raw_data)} records") -``` - -#### Conversion Function - -```py -def convert_to_agent_run(record: dict) -> AgentRun: - """Convert a single record to AgentRun.""" - raw_messages = record.get("messages") or record.get("traj") or [] - messages = [parse_chat_message(m) for m in raw_messages] - - # Handle tool calls if present - for i, msg in enumerate(raw_messages): - if msg.get("role") == "assistant" and msg.get("tool_calls"): - messages[i].tool_calls = [ - ToolCall( - id=tc.get("id", f"call_{i}"), - function=tc.get("function", {}).get("name", tc.get("name", "")), - arguments=tc.get("function", {}).get("arguments", tc.get("arguments", {})), - type="function" - ) - for tc in msg["tool_calls"] - ] - - transcript = Transcript( - messages=messages, - metadata={...} # transcript-level metadata from mapping - ) - - return AgentRun( - transcripts=[transcript], - metadata={ - "scores": {...}, # from mapping - # other metadata from mapping - } - ) -``` - -#### Validation Loop - -Test conversion on a sample before full ingestion: - -```py -errors = [] -for i, record in enumerate(raw_data[:10]): - try: - agent_run = convert_to_agent_run(record) - print(f"✓ Record {i} converted successfully") - except Exception as e: - errors.append((i, str(e))) - print(f"✗ Record {i} failed: {e}") - -if errors: - print(f"\n{len(errors)} validation errors in first 10 records") -``` - -#### Full Conversion - -```py -agent_runs = [] -conversion_errors = [] - -for i, record in enumerate(raw_data): - 
try: - agent_runs.append(convert_to_agent_run(record)) - except Exception as e: - conversion_errors.append({"index": i, "error": str(e)}) - -print(f"Converted {len(agent_runs)}/{len(raw_data)} records") -if conversion_errors: - print(f"Errors ({len(conversion_errors)}): {conversion_errors[:5]}...") -``` - -### **Upload to Docent** - -```py -client = Docent(api_key=DOCENT_API_KEY) - -collection_id = client.create_collection( - name=collection_name, - description="", -) -print(f"Created collection: {collection_id}") - -client.add_agent_runs(collection_id, agent_runs) -print(f"Uploaded {len(agent_runs)} runs") - -print(f"View at: https://docent.transluce.org/collection/{collection_id}") -``` - ---- - -## **Stage 4: Testing & Verification** - -Verify that the upload succeeded and counts match expectations. - -### **Count Verification** - -```py -expected_runs = len(agent_runs) -failed_conversions = len(conversion_errors) -total_source_records = len(raw_data) - -print(f"\n{'='*50}") -print("VERIFICATION REPORT") -print(f"{'='*50}") -print(f"Source records found: {total_source_records}") -print(f"Successfully converted: {expected_runs}") -print(f"Failed to convert: {failed_conversions}") - -# Verify upload via Docent SDK -try: - collection_info = client.get_collection(collection_id) - uploaded_count = collection_info.get("agent_run_count", "unknown") - print(f"Uploaded to Docent: {uploaded_count}") - - if uploaded_count != expected_runs: - print(f"⚠️ WARNING: Count mismatch! 
Expected {expected_runs}, got {uploaded_count}") - else: - print(f"✓ Counts match!") -except Exception as e: - print(f"Could not verify upload count via API: {e}") - print(f"Please verify manually at: https://docent.transluce.org/collection/{collection_id}") -``` - -### **Log Verification Results** - -Update `ingestion-plan.md`: - -``` -## Verification - -### Counts -- Source records: [total_source_records] -- Converted successfully: [expected_runs] -- Conversion failures: [failed_conversions] -- Uploaded to Docent: [uploaded_count] -- **Status:** [MATCH / MISMATCH] - -### Errors (if any) -[List conversion errors with record index and error message] - -### Collection URL -https://docent.transluce.org/collection/[collection_id] -``` - ---- - -## **Reference** - -See the data model guidance and examples in this file for Docent data model documentation. - -For additional guidance on Docent data models and API usage, consult the official documentation: [https://docs.transluce.org/llms.txt](https://docs.transluce.org/llms.txt) - -## **Common Patterns** - -### **Inspect AI Logs** - -When `.eval` files detected, use the built-in loader: - -```py -from inspect_ai.log import read_eval_log -from docent.loaders.load_inspect import load_inspect_log - -eval_log = read_eval_log("path/to/file.eval") -agent_runs = load_inspect_log(eval_log) -``` - -### **Parsing Chat Messages** - -Use `parse_chat_message` to convert dictionaries to proper message objects: - -```py -from docent.data_models.chat import parse_chat_message - -# From dict - automatically determines message type from "role" -msg = parse_chat_message({ - "role": "user", - "content": "What's 2+2?" -}) - -msg = parse_chat_message({ - "role": "assistant", - "content": "The answer is 4." -}) - -msg = parse_chat_message({ - "role": "system", - "content": "You are a helpful assistant." 
-}) - -# Direct construction is also available -from docent.data_models.chat import UserMessage, AssistantMessage, SystemMessage -msg = UserMessage(content="Hello") -msg = AssistantMessage(content="Hi!", model="gpt-4") -``` - -### **Simple Dict to AgentRun** - -A common pattern for converting flat records: - -```py -from docent.data_models import AgentRun, Transcript -from docent.data_models.chat import parse_chat_message - -def convert_simple(record: dict) -> AgentRun: - messages = [parse_chat_message(m) for m in record["messages"]] - return AgentRun( - transcripts=[Transcript(messages=messages)], - metadata={ - "scores": {"reward": record.get("reward", 0)}, - **{k: v for k, v in record.items() if k != "messages"} - } - ) -``` - -### **Tool Calls** - -Handle assistant messages with tool calls and their responses: - -```py -from docent.data_models.chat import AssistantMessage, ToolMessage, ToolCall - -# Assistant making a tool call -assistant_msg = AssistantMessage( - content="Let me search for that.", - tool_calls=[ - ToolCall( - id="call_123", - function="web_search", - arguments={"query": "weather today"}, - type="function" - ) - ] -) - -# Tool response -tool_msg = ToolMessage( - content="Sunny, 72°F", - tool_call_id="call_123", - function="web_search" -) - -# Helper to parse tool calls from raw data -def parse_tool_calls(raw_calls: list) -> list[ToolCall]: - return [ - ToolCall( - id=tc["id"], - function=tc["function"]["name"], - arguments=tc["function"].get("arguments", {}), - type="function" - ) - for tc in raw_calls - ] -``` - -### **Pass@k Evaluation** - -Use `TranscriptGroup` for attempts: - -```py -from uuid import uuid4 -from docent.data_models import AgentRun, Transcript, TranscriptGroup - -def convert_pass_at_k(task_data: dict) -> AgentRun: - agent_run_id = str(uuid4()) - groups = [] - transcripts = [] - - for k, attempt in enumerate(task_data["attempts"]): - group = TranscriptGroup( - name=f"Attempt {k+1}", - agent_run_id=agent_run_id, - 
metadata={"k": k} - ) - groups.append(group) - - transcript = Transcript( - messages=[parse_chat_message(m) for m in attempt["messages"]], - transcript_group_id=group.id, - metadata={"attempt": k} - ) - transcripts.append(transcript) - - return AgentRun( - id=agent_run_id, - transcripts=transcripts, - transcript_groups=groups, - metadata={"task_id": task_data["task_id"]} - ) -``` - -### **Tree/Branching** - -Ingest each branch as its own `Transcript` in its own `AgentRun`. Use metadata to link branches: - -```py -AgentRun( - transcripts=[transcript], - metadata={ - "root_task_id": "task_123", - "branch_id": "branch_a_1", - "parent_branch_id": "branch_a", - "branch_depth": 2, - } -) -``` - -### **Multi-Agent** - -One `Transcript` per agent in the same `AgentRun`: - -```py -AgentRun( - transcripts=[ - Transcript(messages=agent_1_messages, metadata={"agent_id": "agent_1"}), - Transcript(messages=agent_2_messages, metadata={"agent_id": "agent_2"}), - ], - metadata={ - "episode_id": "episode_42", - "scores": {"joint_reward": 0.85} - } -) -``` - -### **Validation** - -Always validate by rendering before upload: - -```py -try: - _ = agent_run.text # Triggers validation - print("Valid") -except Exception as e: - print(f"Invalid: {e}") -``` +If the SDK cannot verify the uploaded count, provide the collection URL and record that manual verification is needed. diff --git a/plugins/docent/skills/docent/readings-reference.md b/plugins/docent/skills/docent/readings-reference.md index 82a5225..990a475 100644 --- a/plugins/docent/skills/docent/readings-reference.md +++ b/plugins/docent/skills/docent/readings-reference.md @@ -32,8 +32,6 @@ You should feel free to iterate on your scripts, but avoid overwriting scripts w * Explore a new question on the same dataset -> create a new script * Take a different approach to the same question -> create a new script -Note: an obsolete version of the SDK provided an API called `LLMRequest`. 
If you encounter old code using LLMRequests, you can offer to migrate it to readings. - ## Core API ### `client.query(collection_id, dql, *, name=None) -> QueryResult` @@ -367,7 +365,7 @@ Note: analysis plans are grouped by name. ```python client.default_collection_id = "" ``` -Used as a fallback when `flush()` resolves which collection to target. Automatically set from `DOCENT_COLLECTION_ID` in `docent.env` or the environment if present. Can also be passed to the `Docent()` constructor as `collection_id`. +Used as a fallback when `flush()` resolves which collection to target. Automatically set from `DOCENT_COLLECTION_ID` in the SDK-discovered `docent.env` or the environment if present. Can also be passed to the `Docent()` constructor as `collection_id`. ### Auto-flush On first `read()` call, an `atexit` handler is registered. Disable with `client.auto_flush = False`. @@ -555,7 +553,7 @@ print(f"Proposed {len(category_names)} clusters: {', '.join(category_names)}") **Stop here.** Run this script, review the proposed clusters, and report them to the user. If the clusters look right, proceed to Phase 2. If not, adjust the summarization prompt or sample and re-run. Re-running is free for unchanged steps (results are cached). -**If something goes wrong:** Check DQL query syntax first (see `dql-reference.md` quirks). Common issues: missing `is_list=True` on aggregated columns, or an empty result set from the sample query. If the clusters are too broad or too narrow, adjust the number of requested categories in the Step 2 prompt or focus the summarization prompt on a more specific aspect of behavior. +**If something goes wrong:** Check DQL query syntax first (see `dql-reference.md` quirks). Common issues: missing `is_list=True` on aggregated columns, or no rows returned by the sample query. If the clusters are too broad or too narrow, adjust the number of requested categories in the Step 2 prompt or focus the summarization prompt on a more specific aspect of behavior. 
### Phase 2: Classify using the proposed clusters diff --git a/plugins/docent/skills/docent/report.md b/plugins/docent/skills/docent/report.md index e7342f1..e8e01c2 100644 --- a/plugins/docent/skills/docent/report.md +++ b/plugins/docent/skills/docent/report.md @@ -191,7 +191,7 @@ Attributes: Behavior: Uses the page's `collection_id` automatically. Shows row count, execution time, truncation info, and a toggle to show/hide the raw DQL. Authoring guidance: -- Keep queries short, explicit, and cheap. Add `LIMIT` unless the full result set is needed. +- Keep queries short, explicit, and cheap. Add `LIMIT` unless every row is genuinely needed. - Use the body to explain why this table matters, not to restate column names. - **Key pattern**: aggregate reading results via DQL rather than stating numbers in prose. A `::dql-table` computing a distribution is always preferable to "52% are X" in text, because the reader can inspect the query. @@ -231,11 +231,7 @@ Use inline citations inside markdown sentences to link claims to specific eviden This claim is grounded in ::citation{type="reading_result" collection_id="collection-uuid" reading_result_id="reading-result-uuid"}. ``` -Use `short="true"` for a compact icon-only citation: - -```md -See ::citation{type="analysis_result" collection_id="collection-uuid" result_set_id="result-set-uuid" result_id="result-uuid" short="true"} for details. -``` +Use `short="true"` for a compact icon-only citation. Rules: - This is inline text, not a block shortcode. 
@@ -254,7 +250,6 @@ Rules: | Type | Required fields | Optional fields | |---|---|---| -| `analysis_result` | `result_set_id`, `result_id` | | | `reading_result` | `reading_result_id` | | | `block_content` | `agent_run_id`, `transcript_id` | `block_idx` (default `0`), `content_idx` | | `agent_run_metadata` | `agent_run_id`, `metadata_key` | | @@ -438,7 +433,7 @@ The moment you type a number in a markdown section, ask yourself: is there a DQL - Do not expect block shortcodes to work inside HTML embeds. - Do not rely on inline citations inside code fences or inline code. - Do not add `collection_id` to block shortcodes — they use the page's collection automatically. Do include `collection_id` on inline `::citation` shortcodes. -- Do not omit `LIMIT` in `::dql-table` queries unless the full result set is genuinely needed. +- Do not omit `LIMIT` in `::dql-table` queries unless every row is genuinely needed. ### Other mistakes