braintrustdata · Ritwij Aryan Parmar (RitwijParmar) · Jun 4, 2026
diff --git a/py/README.md b/py/README.md
@@ -41,6 +41,48 @@ Then run:
 BRAINTRUST_API_KEY=<YOUR_API_KEY> braintrust eval tutorial_eval.py
 ```
 
+## Replay Trace Exports
+
+Use `braintrust replay` to turn a saved trace export into a local regression
+check. This is useful when you want to rerun a task or scorer against traces
+captured in production without sending a new experiment to Braintrust.
+
+```bash
+braintrust replay trace.json \
+  --task my_agent:answer \
+  --score my_scores:answer_quality \
+  --min-score answer_quality=0.85 \
+  --min-score-delta answer_quality=0 \
+  --fail-on-error \
+  --json
+```
+
+The trace file can be JSONL, a JSON list of span rows, or a JSON object with a
+`spans` field. Rows use the same fields Braintrust spans expose, including
+`span_id`, `root_span_id`, `input`, `output`, `expected`, `scores`, `metrics`,
+`metadata`, and `span_attributes`.
+
+Replay tasks receive the root span input and may also accept `expected`,
+`metadata`, and `trace` keyword arguments:
+
+```python
+def answer(input, trace):
+    return app.answer(input["messages"])
+```
+
+Scorers use the same common arguments as eval scorers. When `--task` is omitted, they score the output recorded in the trace:
+
+```python
+async def answer_quality(input, output, expected, trace):
+    tool_spans = await trace.get_spans(["tool"])
+    return output == expected
+```
+
+The report includes current scores, baseline scores from the original root
+span, score deltas, derived trace metrics, and metric deltas. When a threshold
+check fails, the command exits with status 1, so CI can block an agent or
+scorer change that regresses against saved production traces.
+
 ## Optional Extras
 
 Install extras as needed for specific workflows:

diff --git a/py/src/braintrust/__init__.py b/py/src/braintrust/__init__.py
@@ -84,5 +84,6 @@ def is_equal(expected, output):
 from .sandbox import RegisterSandboxResult as RegisterSandboxResult
 from .sandbox import SandboxConfig as SandboxConfig
 from .sandbox import register_sandbox as register_sandbox
+from .trace_replay import *  # NOTE(review): wildcard re-export; the neighbouring lines all use explicit "from .x import Y as Y" -- consider listing the public names
 from .util import BT_IS_ASYNC_ATTRIBUTE as BT_IS_ASYNC_ATTRIBUTE
 from .util import MarkAsyncWrapper as MarkAsyncWrapper
diff --git a/py/src/braintrust/cli/__main__.py b/py/src/braintrust/cli/__main__.py
@@ -5,6 +5,7 @@
 import textwrap
 import traceback
 
+from .. import trace_replay
 from . import eval, install, push
 
 
@@ -36,7 +37,7 @@ def main(args=None):
     )
     subparsers = parser.add_subparsers(help="sub-command help", dest="subcommand", required=True)
 
-    for module in [eval, install, push]:
+    for module in [eval, install, push, trace_replay]:
         module.build_parser(subparsers, parent_parser)
 
     args = parser.parse_args(args=args)

diff --git a/py/src/braintrust/test_trace_replay.py b/py/src/braintrust/test_trace_replay.py
@@ -0,0 +1,149 @@
+import json
+
+import pytest
+
+from braintrust.score import Score
+from braintrust.trace_replay import load_trace_file, replay_traces, run_cli
+
+
+def _agent_trace_rows():  # Fixture: three span rows forming one trace (root task, llm child, tool grandchild).
+    return [
+        {
+            "span_id": "root-1",
+            "root_span_id": "root-1",
+            "is_root": True,
+            "input": {"messages": [{"role": "user", "content": "refund status"}]},
+            "output": {"answer": "refund pending"},  # recorded output; differs from "expected" below
+            "expected": {"answer": "refund approved"},
+            "metadata": {"customer_tier": "enterprise"},
+            "scores": {"answer_match": 0.0},  # recorded score that the replay tests read back as the baseline
+            "metrics": {"tokens": 120, "duration": 2.5},
+            "span_attributes": {"type": "task", "name": "support-agent"},
+        },
+        {
+            "span_id": "llm-1",
+            "root_span_id": "root-1",
+            "input": {"prompt": "Classify refund"},
+            "output": {"tool_call": "lookup_refund"},
+            "metrics": {"tokens": 80, "duration": 1.2},
+            "span_parents": ["root-1"],  # parented directly under the root span
+            "span_attributes": {"type": "llm", "name": "planner"},
+        },
+        {
+            "span_id": "tool-1",
+            "root_span_id": "root-1",
+            "input": {"order_id": "ord_123"},
+            "output": {"status": "approved"},
+            "metrics": {"duration": 0.3},  # this row carries no "tokens" metric
+            "span_parents": ["llm-1"],  # nested under the llm span
+            "span_attributes": {"type": "tool", "name": "lookup_refund"},
+        },
+    ]
+
+
+def _approved_refund_task(input, trace, metadata=None):  # Replay task stub; asserts it was handed the root row's metadata and the fixture trace.
+    assert metadata == {"customer_tier": "enterprise"}
+    assert trace.get_configuration()["root_span_id"] == "root-1"
+    return {"answer": "refund approved", "tool_spans": len(trace.spans)}  # NOTE(review): "tool_spans" is len(trace.spans), i.e. every span in the trace (3 for the fixture), not only tool spans
+
+
+async def _answer_match(input, output, expected, trace):  # Async scorer: checks the replayed input and trace, then scores answer equality.
+    assert input["messages"][0]["content"] == "refund status"
+    llm_spans = await trace.get_spans(["llm"])  # the fixture holds exactly one span of type "llm"
+    assert [span.span_id for span in llm_spans] == ["llm-1"]
+    return Score(name="answer_match", score=float(output["answer"] == expected["answer"]))  # 1.0 on a match, 0.0 otherwise
+
+
+def _boolean_answer_match(input, output, expected):  # Sync scorer returning a bare bool instead of a Score object.
+    return output["answer"] == expected["answer"]
+
+
+def test_load_trace_file_accepts_wrapped_json(tmp_path):  # A JSON object with a "spans" list loads as a single trace case.
+    trace_path = tmp_path / "trace.json"
+    trace_path.write_text(json.dumps({"spans": _agent_trace_rows()}), encoding="utf-8")
+
+    cases = load_trace_file(trace_path)
+
+    assert len(cases) == 1  # the three fixture rows share one root_span_id
+    assert cases[0].trace.root_span_id == "root-1"
+    assert cases[0].baseline_output == {"answer": "refund pending"}  # the root row's recorded output
+    assert cases[0].expected == {"answer": "refund approved"}
+
+
+def test_load_trace_file_accepts_jsonl(tmp_path):  # The same rows written one JSON object per line also load as a single case.
+    trace_path = tmp_path / "trace.jsonl"
+    trace_path.write_text("\n".join(json.dumps(row) for row in _agent_trace_rows()), encoding="utf-8")  # no trailing newline after the last row
+
+    cases = load_trace_file(trace_path)
+
+    assert len(cases) == 1
+    assert len(cases[0].trace.spans) == 3  # root, llm and tool rows are all kept on the trace
+
+
+@pytest.mark.asyncio
+async def test_replay_traces_runs_task_and_reports_score_deltas(tmp_path):  # Replays the fixture through a task and a scorer via the Python API.
+    trace_path = tmp_path / "trace.json"
+    trace_path.write_text(json.dumps(_agent_trace_rows()), encoding="utf-8")  # bare JSON list of span rows
+    cases = load_trace_file(trace_path)
+
+    summary = await replay_traces(cases, task=_approved_refund_task, scorers=[_answer_match])
+
+    result = summary.results[0]
+    assert result.output == {"answer": "refund approved", "tool_spans": 3}
+    assert result.baseline_output == {"answer": "refund pending"}
+    assert result.scores == {"answer_match": 1.0}
+    assert result.baseline_scores == {"answer_match": 0.0}
+    assert result.score_deltas == {"answer_match": 1.0}  # 1.0 now versus the 0.0 recorded on the root row
+    assert result.metrics["span_count"] == 3
+    assert result.metrics["tokens"] == 200  # equals 120 (root) + 80 (llm-1) in the fixture
+    assert result.metric_deltas["tokens"] == 80  # presumably 200 minus the root row's recorded 120 -- TODO confirm
+    assert summary.score_averages == {"answer_match": 1.0}
+    assert summary.score_delta_averages == {"answer_match": 1.0}
+
+
+def test_cli_emits_json_report(tmp_path, capsys):  # run_cli with json=True prints a JSON report to stdout; with no thresholds the checks pass.
+    trace_path = tmp_path / "trace.json"
+    trace_path.write_text(json.dumps(_agent_trace_rows()), encoding="utf-8")
+
+    class Args:  # attribute bag handed to run_cli in place of parsed CLI arguments
+        trace_file = str(trace_path)
+        task = "braintrust.test_trace_replay:_approved_refund_task"  # "module:function" reference
+        score = [
+            "braintrust.test_trace_replay:_answer_match",
+            "braintrust.test_trace_replay:_boolean_answer_match",
+        ]
+        json = True  # class attribute only; json.loads below still resolves to the module
+        min_score = []
+        min_score_delta = []
+        fail_on_error = False
+
+    run_cli(Args())
+
+    report = json.loads(capsys.readouterr().out)
+    assert report["summary"]["trace_count"] == 1
+    assert report["summary"]["failed_count"] == 0
+    assert report["results"][0]["score_deltas"]["answer_match"] == 1.0
+    assert report["results"][0]["scores"]["_boolean_answer_match"] == 1.0  # the bool result is reported as 1.0 under the function's name
+    assert report["checks"] == {"passed": True, "failures": []}
+
+
+def test_cli_thresholds_fail_on_regression(tmp_path, capsys):  # A missed threshold makes run_cli exit with code 1 while still printing the JSON report.
+    trace_path = tmp_path / "trace.json"
+    trace_path.write_text(json.dumps(_agent_trace_rows()), encoding="utf-8")
+
+    class Args:  # attribute bag handed to run_cli in place of parsed CLI arguments
+        trace_file = str(trace_path)
+        task = None  # no task; presumably the recorded output is what gets scored -- TODO confirm
+        score = ["braintrust.test_trace_replay:_answer_match"]
+        json = True  # class attribute only; json.loads below still resolves to the module
+        min_score = ["answer_match=0.5"]  # "name=value" threshold strings
+        min_score_delta = ["answer_match=0"]
+        fail_on_error = False
+
+    with pytest.raises(SystemExit) as exc_info:
+        run_cli(Args())
+
+    assert exc_info.value.code == 1
+    report = json.loads(capsys.readouterr().out)  # the report reached stdout before the exit
+    assert report["checks"]["passed"] is False
+    assert "score 'answer_match' averaged 0.0000" in report["checks"]["failures"][0]