import os
import uuid
import requests
from braintrust import Eval, EvalCase
# Braintrust project that both experiments in this script are created under.
# Plain literal: there is nothing to interpolate, so no f-string prefix.
PROJECT = "<project-name>"
# Required credential: a missing BRAINTRUST_API_KEY raises KeyError here.
API_KEY = os.environ["BRAINTRUST_API_KEY"]
# Optional endpoint override; defaults to https://api.braintrust.dev.
API_URL = os.environ.get("BRAINTRUST_API_URL", "https://api.braintrust.dev")
# Bearer-token header sent with the direct REST request made further down.
HEADERS = {"Authorization": f"Bearer {API_KEY}"}
def exact_match(input, output, expected):
    """Score 1.0 when ``output`` equals ``expected``, otherwise 0.0.

    ``input`` is accepted but not used by the comparison.
    """
    if output == expected:
        return 1.0
    return 0.0
def run_exp(name, output, base_experiment_id=None):
    """Run a one-case eval whose task always returns ``output``; print its summary.

    Args:
        name: Experiment name passed to ``Eval``.
        output: Constant value the task returns regardless of its input.
        base_experiment_id: Optional experiment ID forwarded to ``Eval``
            unchanged (defaults to ``None``).

    Returns:
        The object returned by ``Eval``; its ``summary`` is what gets printed.
    """
    cases = [EvalCase(input="case", expected="yes")]
    result = Eval(
        PROJECT,
        experiment_name=name,
        data=cases,
        task=lambda input: output,
        scores=[exact_match],
        base_experiment_id=base_experiment_id,
    )

    # Report the fields relevant to the comparison being investigated.
    summary = result.summary
    print(f"\n{name}")
    print(f" id: {summary.experiment_id}")
    print(f" compared_to: {summary.comparison_experiment_name}")
    print(f" scores: {summary.scores}")
    return result
# Run B first so its experiment ID can be handed to A as the explicit baseline.
exp_b = run_exp("exp-B-baseline-score-0", "no")

# Run A, passing B's experiment ID as base_experiment_id.
exp_a = run_exp(
    "exp-A-compare-to-B-score-1",
    "yes",
    base_experiment_id=exp_b.summary.experiment_id,
)

# Fetch A's stored record over REST to see which base experiment was persisted.
experiment_url = f"{API_URL}/v1/experiment/{exp_a.summary.experiment_id}"
resp = requests.get(experiment_url, headers=HEADERS, timeout=30)
resp.raise_for_status()
record = resp.json()

# Persisted value on the experiment record vs. B's ID.
print("\nPersisted base:")
print(" expected:", exp_b.summary.experiment_id)
print(" actual: ", record.get("base_exp_id"))

# Comparison name reported in A's returned summary vs. B's name.
print("\nReturned summary comparison:")
print(" expected:", exp_b.summary.experiment_name)
print(" actual: ", exp_a.summary.comparison_experiment_name)
Persisted base:
expected: <exp_b id>
actual: <exp_b id>
Returned summary comparison:
expected: exp-B-baseline-score-0
actual: exp-B-baseline-score-0
Description
`Eval` stores `base_experiment_id` correctly on the experiment, but the final summary does not pass it as the explicit comparison ID. As a result, the summary comparison can fall back to project/default baseline resolution and show the wrong diffs.

Reproduction
Expected
`exp_a` should be compared to `exp_b`.
Observed
`exp_a` is compared to the default baseline experiment selected in Step 2.