From 86834194bb8a08947937204c57898efa03783c23 Mon Sep 17 00:00:00 2001
From: doxav <>
Date: Wed, 27 May 2026 08:22:13 +0200
Subject: [PATCH 1/8] intermediate checkpoint
---
README.md | 6 +
tests/test_cli_external_trainer_validation.py | 37 +++++
tests/test_external_trainer_discovery.py | 34 +++++
tests/test_external_utils.py | 18 +++
tests/test_openevolve_trainer.py | 114 +++++++++++++++
tests/test_resolve_external_trainers.py | 16 +++
tests/test_runner_external_mode.py | 76 ++++++++++
tests/test_textgrad_trainer.py | 101 +++++++++++++
trace_bench/cli.py | 25 +++-
trace_bench/resolve.py | 55 ++++---
trace_bench/runner.py | 13 +-
.../trainers/README_openevolve_trainer.md | 8 ++
.../trainers/README_textgrad_trainer.md | 8 ++
trace_bench/trainers/_external_utils.py | 132 +++++++++++++++++
trace_bench/trainers/openevolve_trainer.py | 136 ++++++++++++++++++
trace_bench/trainers/textgrad_trainer.py | 100 +++++++++++++
16 files changed, 854 insertions(+), 25 deletions(-)
create mode 100644 tests/test_cli_external_trainer_validation.py
create mode 100644 tests/test_external_trainer_discovery.py
create mode 100644 tests/test_external_utils.py
create mode 100644 tests/test_openevolve_trainer.py
create mode 100644 tests/test_resolve_external_trainers.py
create mode 100644 tests/test_runner_external_mode.py
create mode 100644 tests/test_textgrad_trainer.py
create mode 100644 trace_bench/trainers/README_openevolve_trainer.md
create mode 100644 trace_bench/trainers/README_textgrad_trainer.md
create mode 100644 trace_bench/trainers/_external_utils.py
create mode 100644 trace_bench/trainers/openevolve_trainer.py
create mode 100644 trace_bench/trainers/textgrad_trainer.py
diff --git a/README.md b/README.md
index 40e3aae..3cf59bc 100644
--- a/README.md
+++ b/README.md
@@ -128,3 +128,9 @@ PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 pytest -q
## License
MIT
+
+## External Trainers
+
+- `DSPyTrainer` (`trace_bench/trainers/dspy_trainer.py`)
+- `TextGradTrainer` (`trace_bench/trainers/textgrad_trainer.py`)
+- `OpenEvolveTrainer` (`trace_bench/trainers/openevolve_trainer.py`)
diff --git a/tests/test_cli_external_trainer_validation.py b/tests/test_cli_external_trainer_validation.py
new file mode 100644
index 0000000..b86f303
--- /dev/null
+++ b/tests/test_cli_external_trainer_validation.py
@@ -0,0 +1,37 @@
+from trace_bench.cli import _validate_trainer_params
+from trace_bench.config import TrainerConfig
+
+
+class _FakeExternalTrainer:
+ USES_TRACE_OPTIMIZER = False
+
+ def train(
+ self,
+ guide,
+ train_dataset,
+ *,
+ iterations: int = 1,
+ ensure_improvement: bool = True,
+ verbose: bool = False,
+ **_kwargs,
+ ):
+ return {}
+
+
+def test_validate_trainer_params_uses_train_signature(monkeypatch) -> None:
+ monkeypatch.setattr("trace_bench.cli._resolve_algorithm", lambda _trainer_id: _FakeExternalTrainer)
+ trainer = TrainerConfig(
+ id="OpenEvolveTrainer",
+ params_variants=[{"iterations": 2, "ensure_improvement": False}],
+ )
+ errors = []
+ _validate_trainer_params(trainer, errors)
+ assert errors == []
+
+
+def test_validate_trainer_params_rejects_unknown_kwarg(monkeypatch) -> None:
+ monkeypatch.setattr("trace_bench.cli._resolve_algorithm", lambda _trainer_id: _FakeExternalTrainer)
+ trainer = TrainerConfig(id="OpenEvolveTrainer", params_variants=[{"unknown": 1}])
+ errors = []
+ _validate_trainer_params(trainer, errors)
+ assert errors == ["unknown trainer kwarg 'unknown' for OpenEvolveTrainer"]
diff --git a/tests/test_external_trainer_discovery.py b/tests/test_external_trainer_discovery.py
new file mode 100644
index 0000000..78bc5e9
--- /dev/null
+++ b/tests/test_external_trainer_discovery.py
@@ -0,0 +1,34 @@
+import importlib
+import sys
+import types
+
+from trace_bench.registry import discover_trainers
+
+
+def _install_fake_external_dependencies(monkeypatch) -> None:
+ fake_textgrad_module = types.ModuleType("opto.optimizers.textgrad")
+
+ class _FakeTextGrad:
+ def __init__(self, parameters, **_kwargs) -> None:
+ self.parameters = list(parameters)
+
+ fake_textgrad_module.TextGrad = _FakeTextGrad
+ monkeypatch.setitem(sys.modules, "opto.optimizers.textgrad", fake_textgrad_module)
+
+ fake_openevolve_module = types.ModuleType("openevolve")
+ fake_openevolve_module.run_evolution = lambda **_kwargs: {"best_code": 'candidate = {}'}
+ monkeypatch.setitem(sys.modules, "openevolve", fake_openevolve_module)
+
+
+def test_discover_trainers_lists_new_external_trainers_when_dependencies_are_available(monkeypatch) -> None:
+ _install_fake_external_dependencies(monkeypatch)
+
+ import trace_bench.trainers.textgrad_trainer as textgrad_trainer
+ import trace_bench.trainers.openevolve_trainer as openevolve_trainer
+
+ importlib.reload(textgrad_trainer)
+ importlib.reload(openevolve_trainer)
+
+ specs = {spec.id: spec for spec in discover_trainers()}
+ assert specs["TextGradTrainer"].available is True
+ assert specs["OpenEvolveTrainer"].available is True
diff --git a/tests/test_external_utils.py b/tests/test_external_utils.py
new file mode 100644
index 0000000..d72a41e
--- /dev/null
+++ b/tests/test_external_utils.py
@@ -0,0 +1,18 @@
+from trace_bench.trainers._external_utils import apply_parameter_updates
+
+
+class _ReadOnlyDataParam:
+ def __init__(self, value: str) -> None:
+ self._data = value
+
+ @property
+ def data(self) -> str:
+ return self._data
+
+
+def test_apply_parameter_updates_falls_back_to_private_data_slot_when_data_property_has_no_setter() -> None:
+ parameter = _ReadOnlyDataParam("before")
+
+ apply_parameter_updates({parameter: "after"})
+
+ assert parameter.data == "after"
diff --git a/tests/test_openevolve_trainer.py b/tests/test_openevolve_trainer.py
new file mode 100644
index 0000000..1c4bfbd
--- /dev/null
+++ b/tests/test_openevolve_trainer.py
@@ -0,0 +1,114 @@
+import importlib
+import sys
+import types
+
+import pytest
+
+
+class _DummyParam:
+ def __init__(self, name: str, value: str) -> None:
+ self.name = name
+ self.py_name = name
+ self.data = value
+ self.trainable = True
+
+
+class _DummyAgent:
+ def __init__(self, greeting: str = "Hi") -> None:
+ self.greeting = _DummyParam("greeting", greeting)
+
+ def parameters(self):
+ return [self.greeting]
+
+ def __call__(self, query: str) -> str:
+ name = query.split()[-1].strip("!.?")
+ return f"{self.greeting.data}, {name}!"
+
+
+class _DummyGuide:
+ def __call__(self, task_input: str, response: str, task_info: str):
+ del task_input
+ return (1.0 if response == task_info else 0.0), f"expected {task_info}"
+
+
+def _import_openevolve_trainer(monkeypatch, best_code: str):
+ fake_module = types.ModuleType("openevolve")
+
+ def _run_evolution(*, initial_program, evaluator, iterations, **_kwargs):
+ del initial_program, evaluator, iterations
+ return types.SimpleNamespace(best_code=best_code)
+
+ fake_module.run_evolution = _run_evolution
+ monkeypatch.setitem(sys.modules, "openevolve", fake_module)
+ sys.modules.pop("trace_bench.trainers.openevolve_trainer", None)
+ return importlib.import_module("trace_bench.trainers.openevolve_trainer")
+
+
+def test_openevolve_trainer_updates_parameter(monkeypatch) -> None:
+ trainer_module = _import_openevolve_trainer(
+ monkeypatch,
+ best_code='candidate = {"greeting": "Hello"}\n',
+ )
+ trainer = trainer_module.OpenEvolveTrainer(_DummyAgent("Hi"))
+ result = trainer.train(
+ guide=_DummyGuide(),
+ train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]},
+ mode="real",
+ iterations=1,
+ ensure_improvement=False,
+ )
+ assert result["status"] == "ok"
+ assert result["resolved_optimizer"] == "openevolve.run_evolution"
+ assert trainer.param.greeting.data == "Hello"
+
+
+def test_openevolve_trainer_rejects_worse_candidate(monkeypatch) -> None:
+ trainer_module = _import_openevolve_trainer(
+ monkeypatch,
+ best_code='candidate = {"greeting": "Bad"}\n',
+ )
+ trainer = trainer_module.OpenEvolveTrainer(_DummyAgent("Hello"))
+ trainer.train(
+ guide=_DummyGuide(),
+ train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]},
+ mode="real",
+ iterations=1,
+ ensure_improvement=True,
+ )
+ assert trainer.param.greeting.data == "Hello"
+
+
+def test_openevolve_trainer_rejects_invalid_candidate_program(monkeypatch) -> None:
+ trainer_module = _import_openevolve_trainer(
+ monkeypatch,
+ best_code='print("bad candidate")\n',
+ )
+ trainer = trainer_module.OpenEvolveTrainer(_DummyAgent("Hello"))
+ with pytest.raises(ValueError, match="Candidate program"):
+ trainer.train(
+ guide=_DummyGuide(),
+ train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]},
+ mode="real",
+ iterations=1,
+ ensure_improvement=False,
+ )
+
+
+def test_openevolve_trainer_requires_trainable_parameters(monkeypatch) -> None:
+ trainer_module = _import_openevolve_trainer(
+ monkeypatch,
+ best_code='candidate = {"greeting": "Hello"}\n',
+ )
+
+ class _NoTrainables:
+ def parameters(self):
+ return []
+
+ trainer = trainer_module.OpenEvolveTrainer(_NoTrainables())
+ with pytest.raises(ValueError, match="no trainable parameters"):
+ trainer.train(
+ guide=_DummyGuide(),
+ train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]},
+ mode="real",
+ iterations=1,
+ )
diff --git a/tests/test_resolve_external_trainers.py b/tests/test_resolve_external_trainers.py
new file mode 100644
index 0000000..9a7c7d6
--- /dev/null
+++ b/tests/test_resolve_external_trainers.py
@@ -0,0 +1,16 @@
+from trace_bench.resolve import resolve_trainer_kwargs
+
+
+def test_resolve_trainer_kwargs_does_not_inject_gepa_defaults_for_external_trainers() -> None:
+ assert resolve_trainer_kwargs({}, "TextGradTrainer") == {}
+ assert resolve_trainer_kwargs({"iterations": 3}, "OpenEvolveTrainer") == {
+ "iterations": 3
+ }
+
+
+def test_resolve_trainer_kwargs_preserves_gepa_defaults() -> None:
+ resolved = resolve_trainer_kwargs({}, "GEPA-UCB")
+ assert resolved["num_search_iterations"] == 1
+ assert resolved["train_batch_size"] == 2
+ assert resolved["merge_every"] == 2
+ assert resolved["pareto_subset_size"] == 2
diff --git a/tests/test_runner_external_mode.py b/tests/test_runner_external_mode.py
new file mode 100644
index 0000000..3b5b4a8
--- /dev/null
+++ b/tests/test_runner_external_mode.py
@@ -0,0 +1,76 @@
+from trace_bench.config import TrainerConfig
+from trace_bench.runner import _train_bundle
+
+
+class _DummyAgent:
+ def parameters(self):
+ return []
+
+ def __call__(self, query):
+ return query
+
+
+class _DummyGuide:
+ def __call__(self, task_input, response, task_info):
+ return 1.0, "ok"
+
+
+class _FakeExternalTrainer:
+ USES_TRACE_OPTIMIZER = False
+
+ def __init__(self, agent, logger=None):
+ del logger
+ self.param = agent
+
+ def train(self, guide, train_dataset, mode="real", **_kwargs):
+ del guide, train_dataset
+ return {"status": "ok", "resolved_optimizer": mode}
+
+
+def test_runner_passes_mode_to_external_trainers(monkeypatch) -> None:
+ monkeypatch.setattr("trace_bench.runner._resolve_algorithm", lambda _name: _FakeExternalTrainer)
+ bundle = {
+ "param": _DummyAgent(),
+ "guide": _DummyGuide(),
+ "train_dataset": {"inputs": ["x"], "infos": ["y"]},
+ "optimizer_kwargs": {},
+ "metadata": {},
+ }
+ trainer = TrainerConfig(id="FakeExternalTrainer", logger="none")
+ result = _train_bundle(bundle=bundle, trainer_spec=trainer, params={}, mode="stub")
+ assert result["status"] == "ok"
+ assert result["resolved_optimizer"] == "stub"
+
+
+class _FakeNonDSPyExternalTrainer(_FakeExternalTrainer):
+ FRAMEWORK = "trace"
+
+ def train(self, guide, train_dataset, mode="real", **kwargs):
+ del guide, train_dataset, mode
+ return {"status": "ok", "resolved_optimizer": kwargs.get("dspy_lm", "absent")}
+
+
+class _FakeDSPyExternalTrainer(_FakeExternalTrainer):
+ FRAMEWORK = "dspy"
+
+ def train(self, guide, train_dataset, mode="real", **kwargs):
+ del guide, train_dataset, mode
+ return {"status": "ok", "resolved_optimizer": kwargs.get("dspy_lm", "absent")}
+
+
+def test_runner_does_not_inject_dspy_stub_into_non_dspy_external_trainers(monkeypatch) -> None:
+ monkeypatch.setattr("trace_bench.runner._resolve_algorithm", lambda _name: _FakeNonDSPyExternalTrainer)
+ bundle = {"param": _DummyAgent(), "guide": _DummyGuide(), "train_dataset": {"inputs": ["x"], "infos": ["y"]}, "optimizer_kwargs": {}, "metadata": {}}
+ trainer = TrainerConfig(id="FakeNonDSPyExternalTrainer", logger="none")
+ result = _train_bundle(bundle=bundle, trainer_spec=trainer, params={}, mode="stub")
+ assert result["status"] == "ok"
+ assert result["resolved_optimizer"] == "absent"
+
+
+def test_runner_injects_dspy_stub_only_for_dspy_external_trainers(monkeypatch) -> None:
+ monkeypatch.setattr("trace_bench.runner._resolve_algorithm", lambda _name: _FakeDSPyExternalTrainer)
+ bundle = {"param": _DummyAgent(), "guide": _DummyGuide(), "train_dataset": {"inputs": ["x"], "infos": ["y"]}, "optimizer_kwargs": {}, "metadata": {}}
+ trainer = TrainerConfig(id="FakeDSPyExternalTrainer", logger="none")
+ result = _train_bundle(bundle=bundle, trainer_spec=trainer, params={}, mode="stub")
+ assert result["status"] == "ok"
+ assert result["resolved_optimizer"] == "stub"
diff --git a/tests/test_textgrad_trainer.py b/tests/test_textgrad_trainer.py
new file mode 100644
index 0000000..221f84c
--- /dev/null
+++ b/tests/test_textgrad_trainer.py
@@ -0,0 +1,101 @@
+import importlib
+import sys
+import types
+
+import pytest
+
+
+class _DummyParam:
+ def __init__(self, name: str, value: str) -> None:
+ self.name = name
+ self.py_name = name
+ self.data = value
+ self.trainable = True
+
+
+class _DummyAgent:
+ def __init__(self, greeting: str = "Hi") -> None:
+ self.greeting = _DummyParam("greeting", greeting)
+
+ def parameters(self):
+ return [self.greeting]
+
+ def __call__(self, query: str) -> str:
+ name = query.split()[-1].strip("!.?")
+ return f"{self.greeting.data}, {name}!"
+
+
+class _DummyGuide:
+ def __call__(self, task_input: str, response: str, task_info: str):
+ del task_input
+ return (1.0 if response == task_info else 0.0), f"expected {task_info}"
+
+
+def _import_textgrad_trainer(monkeypatch, proposal: str):
+ fake_module = types.ModuleType("opto.optimizers.textgrad")
+
+ class _FakeTextGrad:
+ def __init__(self, parameters, **_kwargs) -> None:
+ self.parameters = list(parameters)
+
+ def zero_feedback(self) -> None:
+ return None
+
+ def backward(self, target, feedback) -> None:
+ del target, feedback
+ return None
+
+ def step(self, bypassing=False, verbose=False):
+ del bypassing, verbose
+ return {self.parameters[0]: proposal}
+
+ fake_module.TextGrad = _FakeTextGrad
+ monkeypatch.setitem(sys.modules, "opto.optimizers.textgrad", fake_module)
+ sys.modules.pop("trace_bench.trainers.textgrad_trainer", None)
+ return importlib.import_module("trace_bench.trainers.textgrad_trainer")
+
+
+def test_textgrad_trainer_updates_parameter(monkeypatch) -> None:
+ trainer_module = _import_textgrad_trainer(monkeypatch, proposal="Hello")
+ trainer = trainer_module.TextGradTrainer(_DummyAgent("Hi"))
+ result = trainer.train(
+ guide=_DummyGuide(),
+ train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]},
+ mode="real",
+ num_epochs=1,
+ batch_size=1,
+ ensure_improvement=False,
+ )
+ assert result["status"] == "ok"
+ assert result["resolved_optimizer"] == "opto.optimizers.textgrad.TextGrad"
+ assert trainer.param.greeting.data == "Hello"
+
+
+def test_textgrad_trainer_rejects_worse_candidate(monkeypatch) -> None:
+ trainer_module = _import_textgrad_trainer(monkeypatch, proposal="Bad")
+ trainer = trainer_module.TextGradTrainer(_DummyAgent("Hello"))
+ trainer.train(
+ guide=_DummyGuide(),
+ train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]},
+ mode="real",
+ num_epochs=1,
+ batch_size=1,
+ ensure_improvement=True,
+ )
+ assert trainer.param.greeting.data == "Hello"
+
+
+def test_textgrad_trainer_requires_trainable_parameters(monkeypatch) -> None:
+ trainer_module = _import_textgrad_trainer(monkeypatch, proposal="Hello")
+
+ class _NoTrainables:
+ def parameters(self):
+ return []
+
+ trainer = trainer_module.TextGradTrainer(_NoTrainables())
+ with pytest.raises(ValueError, match="no trainable parameters"):
+ trainer.train(
+ guide=_DummyGuide(),
+ train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]},
+ mode="real",
+ )
diff --git a/trace_bench/cli.py b/trace_bench/cli.py
index 853b9b8..745aca2 100644
--- a/trace_bench/cli.py
+++ b/trace_bench/cli.py
@@ -1,6 +1,7 @@
from __future__ import annotations
import argparse
+import inspect
import json
from datetime import datetime
from pathlib import Path
@@ -15,7 +16,7 @@
load_task_bundle,
)
from trace_bench.resolve import merge_kwargs, resolve_trainer_kwargs
-from trace_bench.runner import BenchRunner, _has_trainables
+from trace_bench.runner import BenchRunner, _has_trainables, _resolve_algorithm
from trace_bench.artifacts import init_run_dir, write_manifest
from trace_bench.ui import launch_ui
@@ -92,6 +93,25 @@ def _resolve_symbol(module_name: str, symbol: str) -> bool:
return False
+def _allowed_trainer_kwargs_for(trainer_id: str) -> set[str]:
+    """Collect the kwarg names strict validation accepts for ``trainer_id``: the shared allow-list plus the named parameters of the resolved trainer class's ``train``."""
+    accepted = set(_ALLOWED_TRAINER_KWARGS)
+    trainer_cls = _resolve_algorithm(trainer_id)
+    # Only classes are introspected; any other resolved object keeps the shared allow-list unchanged.
+    if isinstance(trainer_cls, type):
+        try:
+            train_params = inspect.signature(trainer_cls.train).parameters
+        except (TypeError, ValueError):
+            # ``train`` cannot be introspected: fall back to the shared allow-list.
+            return accepted
+        # These ``train`` arguments are never counted as user-configurable trainer kwargs.
+        excluded = {"self", "guide", "train_dataset", "validate_dataset", "test_dataset", "mode"}
+        nameable_kinds = (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY)
+        accepted.update(
+            name for name, spec in train_params.items()
+            if name not in excluded and spec.kind in nameable_kinds
+        )
+    return accepted
def _normalize_logger_override(raw: str | None) -> str | None:
@@ -128,9 +148,10 @@ def _default_timeout(mode: str) -> float:
def _validate_trainer_params(trainer, errors: list[str]) -> None:
+ allowed_kwargs = _allowed_trainer_kwargs_for(trainer.id)
for params in trainer.params_variants or [{}]:
for key in params.keys():
- if key not in _ALLOWED_TRAINER_KWARGS:
+ if key not in allowed_kwargs:
errors.append(f"unknown trainer kwarg '{key}' for {trainer.id}")
if trainer.optimizer and not _resolve_symbol("opto.optimizers", trainer.optimizer):
diff --git a/trace_bench/resolve.py b/trace_bench/resolve.py
index e285341..c173ed5 100644
--- a/trace_bench/resolve.py
+++ b/trace_bench/resolve.py
@@ -4,34 +4,55 @@
_FILTERED_KWARGS = {"eval_kwargs", "optimizer_kwargs"}
+_GEPA_TRAINERS = {"GEPA-Base", "GEPA-UCB", "GEPA-Beam"}
def _default_trainer_kwargs(algo_name: str) -> Dict[str, Any]:
+ """Return default kwargs for built-in Trace search trainers only."""
if algo_name == "PrioritySearch":
- return dict(num_epochs=1, num_steps=1, num_batches=1, num_candidates=2, num_proposals=2)
+ return dict(
+ num_epochs=1,
+ num_steps=1,
+ num_batches=1,
+ num_candidates=2,
+ num_proposals=2,
+ )
if algo_name == "GEPA-Base":
return dict(num_iters=1, train_batch_size=2, merge_every=2, pareto_subset_size=2)
- # GEPA-UCB and GEPA-Beam use num_search_iterations
- return dict(num_search_iterations=1, train_batch_size=2, merge_every=2, pareto_subset_size=2)
+ if algo_name in {"GEPA-UCB", "GEPA-Beam"}:
+ return dict(
+ num_search_iterations=1,
+ train_batch_size=2,
+ merge_every=2,
+ pareto_subset_size=2,
+ )
+ return {}
def _param_alias_map(algo_name: str) -> Dict[str, str]:
- base = {
+ alias_map = {
"threads": "num_threads",
- "ps_steps": "num_steps",
- "ps_batches": "num_batches",
- "ps_candidates": "num_candidates",
- "ps_proposals": "num_proposals",
- "ps_mem_update": "memory_update_frequency",
- "gepa_train_bs": "train_batch_size",
- "gepa_merge_every": "merge_every",
- "gepa_pareto_subset": "pareto_subset_size",
}
- if algo_name == "GEPA-Base":
- base["gepa_iters"] = "num_iters"
- else:
- base["gepa_iters"] = "num_search_iterations"
- return base
+ if algo_name == "PrioritySearch":
+ alias_map.update(
+ {
+ "ps_steps": "num_steps",
+ "ps_batches": "num_batches",
+ "ps_candidates": "num_candidates",
+ "ps_proposals": "num_proposals",
+ "ps_mem_update": "memory_update_frequency",
+ }
+ )
+ if algo_name in _GEPA_TRAINERS:
+ alias_map.update(
+ {
+ "gepa_train_bs": "train_batch_size",
+ "gepa_merge_every": "merge_every",
+ "gepa_pareto_subset": "pareto_subset_size",
+ }
+ )
+ alias_map["gepa_iters"] = "num_iters" if algo_name == "GEPA-Base" else "num_search_iterations"
+ return alias_map
def resolve_trainer_kwargs(params: Dict[str, Any], algo_name: str) -> Dict[str, Any]:
diff --git a/trace_bench/runner.py b/trace_bench/runner.py
index c5bc838..2763dab 100644
--- a/trace_bench/runner.py
+++ b/trace_bench/runner.py
@@ -512,13 +512,14 @@ def _dummy_response(*_args, **_kwargs):
uses_trace_optimizer = getattr(algo, "USES_TRACE_OPTIMIZER", True)
- # For DSPy-style external trainers: propagate mode='stub' as
- # dspy_lm='stub' so they configure DummyLM without requiring an explicit
- # dspy_lm param in the config. OpenTrace trainers do not all accept this
- # keyword, so keep the injection limited to external trainers that manage
- # their own optimization loop.
+ if not uses_trace_optimizer:
+ kwargs.setdefault("mode", mode)
+
+ # Keep backward-compatible DSPy stub support, but do not leak DSPy-only
+ # kwargs into unrelated external trainers.
if mode == "stub" and not uses_trace_optimizer:
- kwargs.setdefault("dspy_lm", "stub")
+ if getattr(algo, "FRAMEWORK", None) == "dspy":
+ kwargs.setdefault("dspy_lm", "stub")
# Pass through multi-objective config from bundle if present
objective_config = bundle.get("objective_config")
diff --git a/trace_bench/trainers/README_openevolve_trainer.md b/trace_bench/trainers/README_openevolve_trainer.md
new file mode 100644
index 0000000..e719eab
--- /dev/null
+++ b/trace_bench/trainers/README_openevolve_trainer.md
@@ -0,0 +1,8 @@
+# OpenEvolveTrainer
+
+`OpenEvolveTrainer` is an external Trace-Bench trainer wrapper for `openevolve.run_evolution`.
+
+- Evolves a **safe literal** candidate mapping of trainable parameter values.
+- Never executes candidate code via `exec`.
+- Parses candidates using `ast.parse` and `ast.literal_eval` only.
+- Can optionally keep only improving updates (`ensure_improvement=True`).
diff --git a/trace_bench/trainers/README_textgrad_trainer.md b/trace_bench/trainers/README_textgrad_trainer.md
new file mode 100644
index 0000000..13621fa
--- /dev/null
+++ b/trace_bench/trainers/README_textgrad_trainer.md
@@ -0,0 +1,8 @@
+# TextGradTrainer
+
+`TextGradTrainer` is an external Trace-Bench trainer wrapper for `opto.optimizers.textgrad.TextGrad`.
+
+- Thin wrapper around NewTrace TextGrad.
+- Supports `mode=stub` and `mode=real`.
+- Uses trainable Trace parameters only.
+- Can optionally keep only improving updates (`ensure_improvement=True`).
diff --git a/trace_bench/trainers/_external_utils.py b/trace_bench/trainers/_external_utils.py
new file mode 100644
index 0000000..fc4ebb8
--- /dev/null
+++ b/trace_bench/trainers/_external_utils.py
@@ -0,0 +1,132 @@
+from __future__ import annotations
+
+from copy import deepcopy
+import importlib
+from typing import Any, Dict, List, Mapping, Sequence, Tuple
+
+
+def collect_trainable_parameters(model: Any) -> List[Any]:
+    """Gather the trainable parameters of ``model``; a bare trainable object exposing ``data`` counts as its own single parameter."""
+    if callable(getattr(model, "parameters", None)):
+        trainable = [candidate for candidate in model.parameters() if getattr(candidate, "trainable", False)]
+        if not trainable:
+            raise ValueError("Model.parameters() returned no trainable parameters.")
+        return trainable
+    if not (getattr(model, "trainable", False) and hasattr(model, "data")):
+        raise TypeError("Expected a model with parameters() or a standalone trainable parameter-like object.")
+    return [model]
+
+
+def coerce_like(example_value: Any, candidate_value: Any) -> Any:
+    """Validate ``candidate_value`` against the type of ``example_value`` and return it in that type.
+
+    bool, str, list and dict candidates must already have the matching type and are returned unchanged;
+    an int accepts ints and integral floats; a float accepts any non-bool number; a tuple accepts a list or tuple.
+    Raises TypeError on a mismatch or when ``example_value`` has an unsupported type.
+    """
+    candidate_is_bool = isinstance(candidate_value, bool)
+    # bool is tested first because it is a subclass of int.
+    if isinstance(example_value, bool):
+        if candidate_is_bool:
+            return candidate_value
+        raise TypeError("Expected a boolean candidate value.")
+    if isinstance(example_value, int):
+        if not candidate_is_bool and isinstance(candidate_value, int):
+            return candidate_value
+        if isinstance(candidate_value, float) and candidate_value.is_integer():
+            return int(candidate_value)
+        raise TypeError("Expected an integer candidate value.")
+    if isinstance(example_value, float):
+        if candidate_is_bool or not isinstance(candidate_value, (int, float)):
+            raise TypeError("Expected a numeric candidate value.")
+        return float(candidate_value)
+    if isinstance(example_value, tuple):
+        if isinstance(candidate_value, (list, tuple)):
+            return tuple(candidate_value)
+        raise TypeError("Expected a sequence candidate value.")
+    # Remaining supported types pass the candidate through untouched once its type matches.
+    passthrough = ((str, "Expected a string candidate value."), (list, "Expected a list candidate value."), (dict, "Expected a mapping candidate value."))
+    for expected_type, message in passthrough:
+        if isinstance(example_value, expected_type):
+            if isinstance(candidate_value, expected_type):
+                return candidate_value
+            raise TypeError(message)
+    raise TypeError(f"Unsupported trainable parameter value type: {type(example_value).__name__}.")
+
+
+def snapshot_parameter_values(parameters: Sequence[Any]) -> Dict[Any, Any]:
+    """Map each parameter to an independent deep copy of its current ``data``, for later restoration."""
+    return {owner: deepcopy(owner.data) for owner in parameters}
+
+
+def _set_parameter_value(parameter: Any, value: Any) -> None:
+    """Store a private deep copy of ``value`` on ``parameter.data``, falling back to ``parameter._data``; raise TypeError if neither is writable."""
+    # Copy once, outside the try block, so a failing deepcopy surfaces as itself instead of being mistaken for a read-only field.
+    copied_value = deepcopy(value)
+    try:
+        parameter.data = copied_value
+    except (AttributeError, TypeError) as exc:
+        # ``data`` is not assignable (e.g. a property without a setter): write the private slot behind it instead.
+        if not hasattr(parameter, "_data"):
+            raise TypeError("Parameter object does not expose a writable data field.") from exc
+        parameter._data = copied_value
+
+
+def restore_parameter_values(snapshot: Mapping[Any, Any]) -> None:
+    """Write every value recorded by snapshot_parameter_values() back onto its parameter."""
+    for owner in snapshot:
+        _set_parameter_value(owner, snapshot[owner])
+
+
+def apply_parameter_updates(update_dict: Mapping[Any, Any]) -> None:
+    """Assign each proposed value in ``update_dict`` to its parameter, mutating the parameters in place."""
+    for target, new_value in update_dict.items():
+        _set_parameter_value(target, new_value)
+
+
+def score_model_on_dataset(agent: Any, guide: Any, dataset: Dict[str, Any], *, suppress_exceptions: bool = False) -> Tuple[float, List[str]]:
+    """Run ``agent`` on every dataset example, grade each response with ``guide`` and return ``(mean score, feedback strings)``.
+    Examples come from ``dataset["inputs"]`` paired with ``dataset["infos"]`` (or ``"info"``); ValueError is raised when the two differ
+    in length or are empty. With ``suppress_exceptions`` a failing example scores ``-inf`` and is reported by exception type.
+    """
+    task_inputs = dataset.get("inputs") or []
+    task_infos = dataset.get("infos") or dataset.get("info") or []
+    if len(task_inputs) != len(task_infos):
+        raise ValueError("Dataset 'inputs' and 'infos' must have the same length.")
+    if not task_inputs:
+        raise ValueError("Dataset must contain at least one example.")
+    scores: List[float] = []
+    feedbacks: List[str] = []
+    for index, (task_input, task_info) in enumerate(zip(task_inputs, task_infos)):
+        try:
+            output = agent(task_input)
+            score, feedback = guide(task_input, getattr(output, "data", output), task_info)
+            example_score, example_feedback = float(score), str(feedback)
+        except Exception as exc:
+            if not suppress_exceptions:
+                raise
+            example_score, example_feedback = float("-inf"), f"evaluation_error[{index}]: {type(exc).__name__}"
+        scores.append(example_score)  # recorded only after both conversions succeed, so each example counts exactly once
+        feedbacks.append(example_feedback)
+    return sum(scores) / len(scores), feedbacks
+
+
+def summarize_feedback(feedbacks: Sequence[str], *, max_items: int = 3) -> str:
+    """Join the first ``max_items`` feedback entries, stringified, with ``" | "``."""
+    leading = feedbacks[:max_items]
+    return " | ".join(map(str, leading))
+
+
+def resolve_external_trainer_base() -> type:
+    """Choose the base class for external trainers.
+
+    Returns the first class found under a known name in ``opto.trainer.algorithms.algorithm``, else ``object``.
+    """
+    known_base_names = ("Trainer", "AbstractAlgorithm", "Algorithm", "AlgorithmBase")
+    try:
+        algorithm_module = importlib.import_module("opto.trainer.algorithms.algorithm")
+    except Exception:
+        return object  # any import failure degrades to a plain ``object`` base
+    # Names are probed in priority order; attributes that are missing or are not classes are skipped.
+    candidates = (getattr(algorithm_module, class_name, None) for class_name in known_base_names)
+    return next((candidate for candidate in candidates if isinstance(candidate, type)), object)
diff --git a/trace_bench/trainers/openevolve_trainer.py b/trace_bench/trainers/openevolve_trainer.py
new file mode 100644
index 0000000..43b211a
--- /dev/null
+++ b/trace_bench/trainers/openevolve_trainer.py
@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+import ast
+import inspect
+from pathlib import Path
+from pprint import pformat
+from threading import RLock
+from typing import Any, Dict, List, Optional, Union
+
+try:
+ from openevolve import run_evolution as _run_evolution
+except Exception as exc:
+ raise ImportError("OpenEvolveTrainer requires the optional 'openevolve' package.") from exc
+
+from trace_bench.trainers._external_utils import apply_parameter_updates, collect_trainable_parameters, coerce_like, resolve_external_trainer_base, restore_parameter_values, score_model_on_dataset, snapshot_parameter_values, summarize_feedback
+
+_TrainerBase = resolve_external_trainer_base()
+_EVALUATION_LOCK = RLock()
+
+def _validate_literal_value(value: Any) -> None:
+    """Raise TypeError unless ``repr(value)`` can be read back by ``ast.literal_eval``."""
+    try:
+        ast.literal_eval(repr(value))
+    except Exception as error:
+        raise TypeError(f"OpenEvolveTrainer supports only literal-like parameter values; got {type(value).__name__}.") from error
+
+def _serialize_candidate_program(parameters: List[Any]) -> str:
+    """Render the parameters' current values as a one-assignment ``candidate = {...}`` program keyed by ``py_name``."""
+    payload: Dict[str, Any] = {}
+    for trainable in parameters:
+        current_value = trainable.data
+        _validate_literal_value(current_value)  # reject values that could not be parsed back as literals
+        payload[trainable.py_name] = current_value
+    return f"candidate = {pformat(payload, sort_dicts=True)}\n"
+
+def _parse_candidate_program(program_text: str, parameters: List[Any]) -> Dict[Any, Any]:
+    """Decode a ``candidate = {...}`` program into ``{parameter: value}`` without executing it.
+
+    Raises ValueError when the text is not exactly one literal-dict assignment to ``candidate`` whose keys equal the
+    parameters' ``py_name`` set; each value is then passed through coerce_like() against the parameter's current ``data``.
+    """
+    try:
+        statements = ast.parse(program_text, mode="exec").body
+    except SyntaxError as exc:
+        raise ValueError("Candidate program must be valid Python.") from exc
+    if len(statements) != 1 or not isinstance(statements[0], ast.Assign):
+        raise ValueError("Candidate program must contain exactly one assignment to 'candidate'.")
+    targets = statements[0].targets
+    if not (len(targets) == 1 and isinstance(targets[0], ast.Name) and targets[0].id == "candidate"):
+        raise ValueError("Candidate program must assign a literal mapping to 'candidate'.")
+    try:
+        mapping = ast.literal_eval(statements[0].value)
+    except Exception as exc:
+        raise ValueError("Candidate mapping must be parseable via ast.literal_eval().") from exc
+    if not isinstance(mapping, dict):
+        raise ValueError("Candidate mapping must be a dict.")
+    if set(mapping) != {trainable.py_name for trainable in parameters}:
+        raise ValueError("Candidate mapping keys must exactly match the trainable parameter names.")
+    return {trainable: coerce_like(trainable.data, mapping[trainable.py_name]) for trainable in parameters}
+
+def _extract_best_code(result: Any) -> str:
+    """Return the first string stored under ``best_code``, ``code`` or ``best_program`` in ``result``; raise ValueError if there is none."""
+    field_names = ("best_code", "code", "best_program")
+    # Dicts are probed by key first; every result, dicts included, is then probed by attribute.
+    lookups = [result.get] if isinstance(result, dict) else []
+    lookups.append(lambda field_name: getattr(result, field_name, None))
+    for lookup in lookups:
+        for field_name in field_names:
+            found = lookup(field_name)
+            if isinstance(found, str):
+                return found
+    raise ValueError("run_evolution did not return a best_code-like string.")
+
+def _filter_supported_kwargs(function: Any, kwargs: Dict[str, Any]) -> Dict[str, Any]:
+    """Return a copy of ``kwargs`` limited to names ``function`` declares; everything is kept when it takes ``**kwargs`` or cannot be introspected."""
+    try:
+        declared = inspect.signature(function).parameters
+    except (TypeError, ValueError):
+        declared = None
+    if declared is None or inspect.Parameter.VAR_KEYWORD in {spec.kind for spec in declared.values()}:
+        return dict(kwargs)
+    return {name: kwargs[name] for name in kwargs if name in declared}
+
+class OpenEvolveTrainer(_TrainerBase):
+    """Trace-Bench wrapper around OpenEvolve using safe literal parameter serialization."""
+    # False: this trainer does not use a Trace optimizer (the constructor discards ``optimizer``).
+    USES_TRACE_OPTIMIZER = False
+
+    def __init__(self, agent: Any, optimizer: Any = None, logger: Any = None, **_kwargs: Any) -> None:
+        del optimizer  # accepted but never used
+        self.param = agent  # the agent whose trainable parameters train() updates in place
+        self.logger = logger
+
+    def train(self, guide: Any, train_dataset: Dict[str, Any], *, mode: str = "real", validate_dataset: Optional[Dict[str, Any]] = None, iterations: int = 10, population_size: Optional[int] = None, num_islands: Optional[int] = None, seed: Optional[int] = None, ensure_improvement: bool = True, improvement_threshold: float = 0.0, verbose: Union[bool, str] = False, **_kwargs: Any) -> Dict[str, Any]:
+        """Optimize Trace parameters with OpenEvolve via a literal candidate mapping and return a status dict; raises ValueError for an unknown ``mode`` or ``iterations < 1``, and ``mode="stub"`` returns without touching the parameters."""
+        if mode not in {"real", "stub"}:
+            raise ValueError("mode must be either 'real' or 'stub'.")
+        if iterations < 1:
+            raise ValueError("iterations must be at least 1.")
+        if mode == "stub":
+            return {"status": "ok", "resolved_optimizer": "openevolve.run_evolution"}
+        # Score the untouched agent first; the validation set is preferred over the training set when one is given.
+        parameters = collect_trainable_parameters(self.param)
+        evaluation_dataset = validate_dataset or train_dataset
+        baseline_snapshot = snapshot_parameter_values(parameters)
+        baseline_score, _ = score_model_on_dataset(agent=self.param, guide=guide, dataset=evaluation_dataset, suppress_exceptions=True)
+        # Callback handed to run_evolution: reads a candidate program from a file path and scores it on the live agent.
+        def evaluator(candidate_path: str) -> Dict[str, Any]:
+            program_text = Path(candidate_path).read_text(encoding="utf-8")
+            try:
+                update_dict = _parse_candidate_program(program_text, parameters)
+            except (TypeError, ValueError) as exc:
+                return {"score": float("-inf"), "feedback": str(exc)}  # malformed candidates get the worst score instead of raising
+            with _EVALUATION_LOCK:  # candidates are applied to the shared agent, so evaluations run one at a time
+                snapshot = snapshot_parameter_values(parameters)
+                try:
+                    apply_parameter_updates(update_dict)
+                    score, feedbacks = score_model_on_dataset(agent=self.param, guide=guide, dataset=evaluation_dataset, suppress_exceptions=True)
+                finally:
+                    restore_parameter_values(snapshot)  # always undo the trial values, even if scoring raised
+            return {"score": score, "feedback": summarize_feedback(feedbacks), "artifacts": {"candidate": {parameter.py_name: value for parameter, value in update_dict.items()}}}
+        # Options left as None are dropped, as are names run_evolution does not declare; a non-bool ``verbose`` becomes False.
+        initial_program = _serialize_candidate_program(parameters)
+        run_kwargs = {"iterations": iterations, "population_size": population_size, "num_islands": num_islands, "seed": seed, "verbose": verbose if isinstance(verbose, bool) else False}
+        filtered_kwargs = _filter_supported_kwargs(_run_evolution, {key: value for key, value in run_kwargs.items() if value is not None})
+        result = _run_evolution(initial_program=initial_program, evaluator=evaluator, **filtered_kwargs)
+        # Apply the winning candidate; with ensure_improvement it is rolled back unless it reaches baseline + threshold.
+        best_code = _extract_best_code(result)
+        best_update = _parse_candidate_program(best_code, parameters)
+        apply_parameter_updates(best_update)
+        if ensure_improvement:
+            candidate_score, _ = score_model_on_dataset(agent=self.param, guide=guide, dataset=evaluation_dataset, suppress_exceptions=True)
+            if candidate_score < baseline_score + improvement_threshold:
+                restore_parameter_values(baseline_snapshot)
+
+        return {"status": "ok", "resolved_optimizer": "openevolve.run_evolution"}
diff --git a/trace_bench/trainers/textgrad_trainer.py b/trace_bench/trainers/textgrad_trainer.py
new file mode 100644
index 0000000..a8eb5e0
--- /dev/null
+++ b/trace_bench/trainers/textgrad_trainer.py
@@ -0,0 +1,100 @@
+from __future__ import annotations
+
+from typing import Any, Dict, Optional, Union
+
+from opto import trace
+
+try:
+ from opto.optimizers.textgrad import TextGrad as _TraceTextGrad
+except Exception as exc:
+ raise ImportError("TextGradTrainer requires opto.optimizers.textgrad from the NewTrace fork.") from exc
+
+from trace_bench.trainers._external_utils import (
+ apply_parameter_updates,
+ collect_trainable_parameters,
+ coerce_like,
+ resolve_external_trainer_base,
+ restore_parameter_values,
+ score_model_on_dataset,
+ snapshot_parameter_values,
+)
+
+
+_TrainerBase = resolve_external_trainer_base()  # base class for TextGradTrainer, resolved once when this module is imported
+
+
+class TextGradTrainer(_TrainerBase):
+    """Trace-Bench trainer that drives the TextGrad optimizer shipped with NewTrace."""
+
+    USES_TRACE_OPTIMIZER = False
+
+    def __init__(self, agent: Any, optimizer: Any = None, logger: Any = None, **_kwargs: Any) -> None:
+        # An externally supplied optimizer is discarded: train() constructs its own TextGrad instance.
+        del optimizer
+        self.param = agent
+        self.logger = logger
+
+    def _normalize_updates(self, update_dict: Dict[Any, Any]) -> Dict[Any, Any]:
+        """Map every proposed value through coerce_like, using the parameter's current data as the template."""
+        return {parameter: coerce_like(parameter.data, proposed) for parameter, proposed in update_dict.items()}
+
+    def _standard_optimization_step(self, guide: Any, task_input: Any, task_info: Any, min_score: float) -> tuple[Any, float, Any]:
+        """Run the agent and guide on one example; a trace.ExecutionError yields min_score and the failing node's feedback."""
+        try:
+            output_node = self.param(task_input)
+            raw_score, guide_feedback = guide(task_input, getattr(output_node, "data", output_node), task_info)
+            return output_node, float(raw_score), guide_feedback
+        except trace.ExecutionError as error:
+            failed_node = error.exception_node
+            return failed_node, float(min_score), failed_node.create_feedback("full")
+
+    def train(self, guide: Any, train_dataset: Dict[str, Any], *, mode: str = "real", num_epochs: int = 1, batch_size: int = 1, min_score: float = 0.0, validate_dataset: Optional[Dict[str, Any]] = None, ensure_improvement: bool = True, improvement_threshold: float = 0.0, max_tokens: int = 4096, verbose: Union[bool, str] = False, **_kwargs: Any) -> Dict[str, Any]:
+        """Optimize the agent's trainable parameters with TextGrad, proposing one update per mini-batch.
+
+        With ensure_improvement, an update is reverted when its score on validate_dataset (or on the
+        current batch if none is given) falls below the pre-update score plus improvement_threshold.
+        Raises ValueError for an unknown mode, num_epochs or batch_size below 1, or a mismatched or empty train_dataset.
+        """
+        if mode not in {"real", "stub"}:
+            raise ValueError("mode must be either 'real' or 'stub'.")
+        if num_epochs < 1:
+            raise ValueError("num_epochs must be at least 1.")
+        if batch_size < 1:
+            raise ValueError("batch_size must be at least 1.")
+        outcome = {"status": "ok", "resolved_optimizer": "opto.optimizers.textgrad.TextGrad"}
+        if mode == "stub":
+            return outcome
+
+        parameters = collect_trainable_parameters(self.param)
+        inputs = train_dataset.get("inputs") or []
+        infos = train_dataset.get("infos") or train_dataset.get("info") or []
+        if len(inputs) != len(infos):
+            raise ValueError("train_dataset 'inputs' and 'infos' must have the same length.")
+        if not inputs:
+            raise ValueError("train_dataset must contain at least one example.")
+
+        optimizer = _TraceTextGrad(parameters=parameters, max_tokens=max_tokens)
+        for _ in range(num_epochs):
+            for start in range(0, len(inputs), batch_size):
+                window = slice(start, start + batch_size)
+                batch_inputs, batch_infos = inputs[window], infos[window]
+                evaluation_dataset = validate_dataset or {"inputs": batch_inputs, "infos": batch_infos}
+                optimizer.zero_feedback()
+                for task_input, task_info in zip(batch_inputs, batch_infos):
+                    target, _score, feedback = self._standard_optimization_step(guide=guide, task_input=task_input, task_info=task_info, min_score=min_score)
+                    optimizer.backward(target, feedback)
+                normalized = self._normalize_updates(optimizer.step(bypassing=True, verbose=verbose))
+                if not normalized:
+                    continue
+                # Snapshot before applying so a rejected proposal can be rolled back.
+                snapshot = snapshot_parameter_values(parameters)
+                baseline_score: Optional[float] = None
+                if ensure_improvement:
+                    baseline_score, _ = score_model_on_dataset(agent=self.param, guide=guide, dataset=evaluation_dataset, suppress_exceptions=True)
+                apply_parameter_updates(normalized)
+                if not ensure_improvement or baseline_score is None:
+                    continue
+                candidate_score, _ = score_model_on_dataset(agent=self.param, guide=guide, dataset=evaluation_dataset, suppress_exceptions=True)
+                if candidate_score < baseline_score + improvement_threshold:
+                    restore_parameter_values(snapshot)
+        return outcome
From 34161d317351aad15b6ba82cc5edee8c02d20766 Mon Sep 17 00:00:00 2001
From: doxav <>
Date: Tue, 2 Jun 2026 08:03:10 +0200
Subject: [PATCH 2/8] stabilized notebook
---
...tgrad_openevolve_evaluation_notebook.ipynb | 3017 +++++++++++++++++
tests/test_dspy_trainer.py | 28 +
tests/test_llm_utils.py | 19 +
tests/test_openevolve_trainer.py | 101 +-
trace_bench/trainers/dspy_trainer.py | 2 +-
trace_bench/trainers/openevolve_trainer.py | 79 +-
6 files changed, 3237 insertions(+), 9 deletions(-)
create mode 100644 notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb
create mode 100644 tests/test_dspy_trainer.py
create mode 100644 tests/test_llm_utils.py
diff --git a/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb b/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb
new file mode 100644
index 0000000..87d0955
--- /dev/null
+++ b/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb
@@ -0,0 +1,3017 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "6fa48e6e",
+ "metadata": {},
+ "source": [
+ "# Trainer comparison notebook\n",
+ "\n",
+ "This notebook validates and compares four real trainer paths:\n",
+ "\n",
+ "- `PrioritySearch` as the Trace baseline\n",
+ "- `TextGradTrainer`\n",
+ "- `OpenEvolveTrainer`\n",
+ "- `DSPyTrainer`\n",
+ "\n",
+ "It checks out the `textgrad_openevolve` branch, installs the real optional packages when needed, runs focused structural checks, and then runs a tiny real train/test comparison with OpenRouter or OpenAI.\n",
+ "The notebook assumes the `textgrad_openevolve` branch contains the trainer integration under test."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0f51598c",
+ "metadata": {},
+ "source": [
+ "## What this notebook verifies\n",
+ "\n",
+ "- required trainer packages import from real installations\n",
+ "- Trace-Bench discovers the trainer classes\n",
+ "- focused tests and compile checks pass\n",
+ "- every comparison row uses three train examples and three held-out examples\n",
+ "- result tables show trainer status, optimizer identity, before/after scores, and per-example outputs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b4ae0512",
+ "metadata": {},
+ "source": [
+ "## High-level interpretation guide\n",
+ "\n",
+ "Use this notebook in three layers:\n",
+ "\n",
+ "1. **Code-level correctness**\n",
+ " - Do the new trainers exist?\n",
+ " - Are they discovered?\n",
+ " - Do focused tests pass?\n",
+ "\n",
+ "2. **Behavior-level smoke checks**\n",
+ " - Do the trainer paths run against real installed packages?\n",
+ " - Do they produce comparable before/after rows?\n",
+ "\n",
+ "3. **Practical comparison**\n",
+ " - Which trainers improve on the tiny task?\n",
+ " - Which trainers complete but do not improve in this small budget?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "56d885a1",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-05-28T17:29:44.264090Z",
+ "iopub.status.busy": "2026-05-28T17:29:44.263974Z",
+ "iopub.status.idle": "2026-05-28T17:29:44.268749Z",
+ "shell.execute_reply": "2026-05-28T17:29:44.268429Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WORKDIR = /home/xav/code/Trace-Bench\n",
+ "TRACE_BENCH_REMOTE_URL = https://github.com/doxav/Trace-Bench.git\n",
+ "TRACE_BENCH_BRANCH = textgrad_openevolve\n",
+ "TRACE_BENCH_REPO = /home/xav/code/Trace-Bench\n",
+ "NEWTRACE_REMOTE_URL = https://github.com/doxav/NewTrace.git\n",
+ "NEWTRACE_BRANCH = experimental\n",
+ "NEWTRACE_REPO = /home/xav/code/Trace-Bench/NewTrace\n",
+ "OPENEVOLVE_REMOTE_URL = https://github.com/algorithmicsuperintelligence/openevolve.git\n",
+ "OPENEVOLVE_BRANCH = main\n",
+ "OPENEVOLVE_REPO = /home/xav/code/Trace-Bench/openevolve\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "import subprocess\n",
+ "from collections.abc import Sequence\n",
+ "from pathlib import Path\n",
+ "from subprocess import CompletedProcess\n",
+ "\n",
+ "WORKDIR = Path(\"/content\") if Path(\"/content\").exists() else Path.cwd()\n",
+ "CURRENT_REPO = Path.cwd()\n",
+ "TRACE_BENCH_REMOTE_URL = \"https://github.com/doxav/Trace-Bench.git\"\n",
+ "TRACE_BENCH_BRANCH = \"textgrad_openevolve\"\n",
+ "TRACE_BENCH_REPO = CURRENT_REPO if (CURRENT_REPO / \"trace_bench\").is_dir() else WORKDIR / \"Trace-Bench\"\n",
+ "NEWTRACE_REMOTE_URL = \"https://github.com/doxav/NewTrace.git\"\n",
+ "NEWTRACE_BRANCH = \"experimental\"\n",
+ "NEWTRACE_REPO = WORKDIR / \"NewTrace\"\n",
+ "OPENEVOLVE_REMOTE_URL = \"https://github.com/algorithmicsuperintelligence/openevolve.git\"\n",
+ "OPENEVOLVE_BRANCH = \"main\"\n",
+ "OPENEVOLVE_REPO = WORKDIR / \"openevolve\"\n",
+ "\n",
+ "for repo_path in (NEWTRACE_REPO, TRACE_BENCH_REPO):\n",
+ " repo_path_str = str(repo_path)\n",
+ " if repo_path_str not in sys.path:\n",
+ " sys.path.insert(0, repo_path_str)\n",
+ "\n",
+ "def run(cmd: Sequence[str | os.PathLike[str]], cwd: Path | str | None = None, check: bool = True) -> CompletedProcess[bytes]:\n",
+ " \"\"\"Run a subprocess command and echo its argv without shell interpolation.\"\"\"\n",
+ " print(\"$\", \" \".join(map(str, cmd)))\n",
+ " return subprocess.run([str(part) for part in cmd], cwd=cwd, check=check)\n",
+ "\n",
+ "def checkout_branch(repo_path: Path, remote_url: str, branch: str) -> None:\n",
+ " \"\"\"Fetch, checkout, and fast-forward a branch in an existing clone.\"\"\"\n",
+ " run([\"git\", \"fetch\", remote_url, branch], cwd=repo_path)\n",
+ " checkout = run([\"git\", \"checkout\", branch], cwd=repo_path, check=False)\n",
+ " if checkout.returncode != 0:\n",
+ " run([\"git\", \"checkout\", \"-b\", branch, \"FETCH_HEAD\"], cwd=repo_path)\n",
+ " run([\"git\", \"pull\", \"--ff-only\", remote_url, branch], cwd=repo_path)\n",
+ "\n",
+ "print(\"WORKDIR =\", WORKDIR)\n",
+ "print(\"TRACE_BENCH_REMOTE_URL =\", TRACE_BENCH_REMOTE_URL)\n",
+ "print(\"TRACE_BENCH_BRANCH =\", TRACE_BENCH_BRANCH)\n",
+ "print(\"TRACE_BENCH_REPO =\", TRACE_BENCH_REPO)\n",
+ "print(\"NEWTRACE_REMOTE_URL =\", NEWTRACE_REMOTE_URL)\n",
+ "print(\"NEWTRACE_BRANCH =\", NEWTRACE_BRANCH)\n",
+ "print(\"NEWTRACE_REPO =\", NEWTRACE_REPO)\n",
+ "print(\"OPENEVOLVE_REMOTE_URL =\", OPENEVOLVE_REMOTE_URL)\n",
+ "print(\"OPENEVOLVE_BRANCH =\", OPENEVOLVE_BRANCH)\n",
+ "print(\"OPENEVOLVE_REPO =\", OPENEVOLVE_REPO)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6d7c51fb",
+ "metadata": {},
+ "source": [
+ "## 1. Clone and checkout the repositories\n",
+ "\n",
+ "This clones:\n",
+ "- `Trace-Bench` on `textgrad_openevolve`\n",
+ "- `doxav/NewTrace` on `experimental`\n",
+ "- `OpenEvolve` is not cloned by this step; the install step clones it, and only if the real package is missing\n",
+ "\n",
+ "Skip this if you already have local checkouts and want to point the notebook at them manually."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "b6ca8593",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-05-28T17:29:44.270467Z",
+ "iopub.status.busy": "2026-05-28T17:29:44.270388Z",
+ "iopub.status.idle": "2026-05-28T17:29:46.996012Z",
+ "shell.execute_reply": "2026-05-28T17:29:46.995690Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Trace-Bench already exists; checking out textgrad_openevolve.\n",
+ "$ git fetch https://github.com/doxav/Trace-Bench.git textgrad_openevolve\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "From https://github.com/doxav/Trace-Bench\n",
+ " * branch textgrad_openevolve -> FETCH_HEAD\n",
+ "Already on 'textgrad_openevolve'\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "$ git checkout textgrad_openevolve\n",
+ "M\ttests/test_openevolve_trainer.py\n",
+ "M\ttrace_bench/trainers/openevolve_trainer.py\n",
+ "Your branch is up to date with 'origin/textgrad_openevolve'.\n",
+ "$ git pull --ff-only https://github.com/doxav/Trace-Bench.git textgrad_openevolve\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "From https://github.com/doxav/Trace-Bench\n",
+ " * branch textgrad_openevolve -> FETCH_HEAD\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Already up to date.\n",
+ "NewTrace already exists; checking out experimental.\n",
+ "$ git fetch https://github.com/doxav/NewTrace.git experimental\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "From https://github.com/doxav/NewTrace\n",
+ " * branch experimental -> FETCH_HEAD\n",
+ "Already on 'experimental'\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "$ git checkout experimental\n",
+ "Your branch is up to date with 'origin/experimental'.\n",
+ "$ git pull --ff-only https://github.com/doxav/NewTrace.git experimental\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Already up to date.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "From https://github.com/doxav/NewTrace\n",
+ " * branch experimental -> FETCH_HEAD\n"
+ ]
+ }
+ ],
+ "source": [
+ "if not TRACE_BENCH_REPO.exists():\n",
+ " run([\n",
+ " \"git\", \"clone\",\n",
+ " \"--branch\", TRACE_BENCH_BRANCH,\n",
+ " \"--single-branch\",\n",
+ " TRACE_BENCH_REMOTE_URL,\n",
+ " str(TRACE_BENCH_REPO),\n",
+ " ])\n",
+ "else:\n",
+ " print(f\"Trace-Bench already exists; checking out {TRACE_BENCH_BRANCH}.\")\n",
+ " checkout_branch(TRACE_BENCH_REPO, TRACE_BENCH_REMOTE_URL, TRACE_BENCH_BRANCH)\n",
+ "\n",
+ "if not NEWTRACE_REPO.exists():\n",
+ " run([\n",
+ " \"git\", \"clone\",\n",
+ " \"--branch\", NEWTRACE_BRANCH,\n",
+ " \"--single-branch\",\n",
+ " NEWTRACE_REMOTE_URL,\n",
+ " str(NEWTRACE_REPO),\n",
+ " ])\n",
+ "else:\n",
+ " print(f\"NewTrace already exists; checking out {NEWTRACE_BRANCH}.\")\n",
+ " checkout_branch(NEWTRACE_REPO, NEWTRACE_REMOTE_URL, NEWTRACE_BRANCH)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "963c01d5",
+ "metadata": {},
+ "source": [
+ "## 2. Install Python dependencies\n",
+ "\n",
+ "This installs:\n",
+ "- `NewTrace` editable\n",
+ "- `Trace-Bench` editable\n",
+ "- light dependencies needed for the focused validation notebook\n",
+ "\n",
+ "If `openevolve.run_evolution` is not importable, this clones OpenEvolve from GitHub and installs it editable."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "fbae758b",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-05-28T17:29:46.997122Z",
+ "iopub.status.busy": "2026-05-28T17:29:46.997049Z",
+ "iopub.status.idle": "2026-05-28T17:29:51.676176Z",
+ "shell.execute_reply": "2026-05-28T17:29:51.675385Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "$ /home/xav/miniconda3/bin/python -m pip install -q -U pip setuptools wheel\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "$ /home/xav/miniconda3/bin/python -m pip install -q graphviz pyyaml pytest litellm aiohttp nest_asyncio dspy-ai tensorboard tensorboardX scikit-learn datasets openai pandas\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "$ /home/xav/miniconda3/bin/python -m pip install -q -e /home/xav/code/Trace-Bench/NewTrace\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "$ /home/xav/miniconda3/bin/python -m pip install -q -e /home/xav/code/Trace-Bench\n"
+ ]
+ }
+ ],
+ "source": [
+ "run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-U\", \"pip\", \"setuptools\", \"wheel\"])\n",
+ "run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\",\n",
+ " \"graphviz\", \"pyyaml\", \"pytest\", \"litellm\", \"aiohttp\", \"nest_asyncio\", \"dspy-ai\",\n",
+ " \"tensorboard\", \"tensorboardX\", \"scikit-learn\", \"datasets\", \"openai\", \"pandas\"])\n",
+ "\n",
+ "run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-e\", str(NEWTRACE_REPO)])\n",
+ "run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-e\", str(TRACE_BENCH_REPO)])\n",
+ "\n",
+ "def has_real_openevolve() -> bool:\n",
+ " \"\"\"Return True only when the real OpenEvolve API is importable.\"\"\"\n",
+ " try:\n",
+ " import openevolve\n",
+ " return callable(getattr(openevolve, \"run_evolution\", None))\n",
+ " except Exception:\n",
+ " return False\n",
+ "\n",
+ "if not has_real_openevolve():\n",
+ " if not OPENEVOLVE_REPO.exists():\n",
+ " run([\n",
+ " \"git\", \"clone\",\n",
+ " \"--branch\", OPENEVOLVE_BRANCH,\n",
+ " \"--single-branch\",\n",
+ " OPENEVOLVE_REMOTE_URL,\n",
+ " str(OPENEVOLVE_REPO),\n",
+ " ])\n",
+ " else:\n",
+ " print(f\"OpenEvolve already exists; checking out {OPENEVOLVE_BRANCH}.\")\n",
+ " checkout_branch(OPENEVOLVE_REPO, OPENEVOLVE_REMOTE_URL, OPENEVOLVE_BRANCH)\n",
+ " run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-e\", str(OPENEVOLVE_REPO)])\n",
+ "\n",
+ "if not has_real_openevolve():\n",
+ " raise ImportError(\"OpenEvolve is required for this demo and could not be installed.\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "25fd9e44",
+ "metadata": {},
+ "source": [
+ "## 3. Provider setup for real online experiments\n",
+ "\n",
+ "The real smoke comparison requires this provider setup. Structural tests can still run before a provider is configured.\n",
+ "\n",
+ "Supported:\n",
+ "- `openrouter`\n",
+ "- `openai`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "0b984ab0",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-05-28T17:29:51.677483Z",
+ "iopub.status.busy": "2026-05-28T17:29:51.677372Z",
+ "iopub.status.idle": "2026-05-28T17:29:51.680910Z",
+ "shell.execute_reply": "2026-05-28T17:29:51.680574Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "PROVIDER = openrouter\n",
+ "TRACE_LITELLM_MODEL = openrouter/openai/gpt-4o-mini\n",
+ "OPENAI_BASE_URL = https://openrouter.ai/api/v1\n",
+ "OPENROUTER_API_KEY configured = True\n"
+ ]
+ }
+ ],
+ "source": [
+ "from getpass import getpass\n",
+ "\n",
+ "def colab_secret(name: str) -> str:\n",
+ " \"\"\"Return a Colab Secret value when available, otherwise an empty string.\"\"\"\n",
+ " try:\n",
+ " from google.colab import userdata\n",
+ " except Exception:\n",
+ " return \"\"\n",
+ " try:\n",
+ " return userdata.get(name) or \"\"\n",
+ " except Exception:\n",
+ " return \"\"\n",
+ "\n",
+ "PROVIDER = \"auto\" # @param [\"auto\", \"openrouter\", \"openai\", \"none\"]\n",
+ "MODEL = \"\" # @param {type:\"string\"}\n",
+ "\n",
+ "openrouter_key = os.environ.get(\"OPENROUTER_API_KEY\") or colab_secret(\"OPENROUTER_API_KEY\")\n",
+ "openai_key = os.environ.get(\"OPENAI_API_KEY\") or colab_secret(\"OPENAI_API_KEY\")\n",
+ "MODEL = MODEL or os.environ.get(\"TRACE_LITELLM_MODEL\") or colab_secret(\"TRACE_LITELLM_MODEL\")\n",
+ "\n",
+ "if PROVIDER == \"auto\":\n",
+ " active_provider = \"openrouter\" if openrouter_key else \"openai\" if openai_key else \"none\"\n",
+ "else:\n",
+ " active_provider = PROVIDER\n",
+ "\n",
+ "if active_provider == \"openrouter\":\n",
+ " if not MODEL:\n",
+ " MODEL = \"openrouter/openai/gpt-4o-mini\"\n",
+ " if not openrouter_key:\n",
+ " openrouter_key = getpass(\"OPENROUTER_API_KEY: \")\n",
+ " if not openrouter_key:\n",
+ " raise ValueError(\"OPENROUTER_API_KEY is required when PROVIDER is openrouter.\")\n",
+ " os.environ[\"OPENROUTER_API_KEY\"] = openrouter_key\n",
+ " os.environ[\"OPENAI_API_KEY\"] = openrouter_key\n",
+ " os.environ[\"OPENAI_BASE_URL\"] = \"https://openrouter.ai/api/v1\"\n",
+ " os.environ[\"OPENAI_API_BASE\"] = \"https://openrouter.ai/api/v1\"\n",
+ " os.environ[\"TRACE_LITELLM_MODEL\"] = MODEL\n",
+ "elif active_provider == \"openai\":\n",
+ " if not MODEL:\n",
+ " MODEL = \"gpt-4o-mini\"\n",
+ " if not openai_key:\n",
+ " openai_key = getpass(\"OPENAI_API_KEY: \")\n",
+ " if not openai_key:\n",
+ " raise ValueError(\"OPENAI_API_KEY is required when PROVIDER is openai.\")\n",
+ " os.environ[\"OPENAI_API_KEY\"] = openai_key\n",
+ " os.environ[\"TRACE_LITELLM_MODEL\"] = MODEL\n",
+ "elif active_provider == \"none\":\n",
+ " print(\"Skipping online provider configuration.\")\n",
+ "else:\n",
+ " raise ValueError(f\"Unsupported PROVIDER: {PROVIDER}\")\n",
+ "\n",
+ "print(\"PROVIDER =\", active_provider)\n",
+ "print(\"TRACE_LITELLM_MODEL =\", os.environ.get(\"TRACE_LITELLM_MODEL\"))\n",
+ "print(\"OPENAI_BASE_URL =\", os.environ.get(\"OPENAI_BASE_URL\"))\n",
+ "print(\"OPENROUTER_API_KEY configured =\", bool(os.environ.get(\"OPENROUTER_API_KEY\")))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1a574c62",
+ "metadata": {},
+ "source": [
+ "## 4. Sanity checks and imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "3b4768bb",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-05-28T17:29:51.682018Z",
+ "iopub.status.busy": "2026-05-28T17:29:51.681957Z",
+ "iopub.status.idle": "2026-05-28T17:29:56.379387Z",
+ "shell.execute_reply": "2026-05-28T17:29:56.378979Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "OK: opto.optimizers.textgrad\n",
+ "OK: openevolve\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "OK: dspy\n",
+ "OK: trace_bench\n",
+ "OK: trace_bench.runner\n",
+ "OK: trace_bench.registry\n",
+ "OK: trace_bench.config\n",
+ "OK: trace_bench.trainers.textgrad_trainer\n",
+ "OK: trace_bench.trainers.openevolve_trainer\n",
+ "OK: trace_bench.trainers.dspy_trainer\n",
+ "TextGrad module: /home/xav/code/Trace-Bench/NewTrace/opto/optimizers/textgrad.py\n",
+ "OpenEvolve module: /home/xav/miniconda3/lib/python3.13/site-packages/openevolve/__init__.py\n",
+ "DSPy module: /home/xav/miniconda3/lib/python3.13/site-packages/dspy/__init__.py\n"
+ ]
+ }
+ ],
+ "source": [
+ "import importlib\n",
+ "import pandas as pd\n",
+ "\n",
+ "def required_import(name: str) -> object:\n",
+ " \"\"\"Import a required module and raise a descriptive error when unavailable.\"\"\"\n",
+ " try:\n",
+ " module = importlib.import_module(name)\n",
+ " print(\"OK:\", name)\n",
+ " return module\n",
+ " except Exception as exc:\n",
+ " raise ImportError(f\"Required module is unavailable: {name}\") from exc\n",
+ "\n",
+ "textgrad_module = required_import(\"opto.optimizers.textgrad\")\n",
+ "openevolve_module = required_import(\"openevolve\")\n",
+ "dspy_module = required_import(\"dspy\")\n",
+ "required_import(\"trace_bench\")\n",
+ "required_import(\"trace_bench.runner\")\n",
+ "required_import(\"trace_bench.registry\")\n",
+ "required_import(\"trace_bench.config\")\n",
+ "required_import(\"trace_bench.trainers.textgrad_trainer\")\n",
+ "required_import(\"trace_bench.trainers.openevolve_trainer\")\n",
+ "required_import(\"trace_bench.trainers.dspy_trainer\")\n",
+ "\n",
+ "if not callable(getattr(textgrad_module, \"TextGrad\", None)):\n",
+ " raise ImportError(\"opto.optimizers.textgrad.TextGrad is required for this demo.\")\n",
+ "if not callable(getattr(openevolve_module, \"run_evolution\", None)):\n",
+ " raise ImportError(\"openevolve.run_evolution is required for this demo.\")\n",
+ "if not callable(getattr(dspy_module, \"LM\", None)):\n",
+ " raise ImportError(\"dspy.LM is required for this demo.\")\n",
+ "\n",
+ "print(\"TextGrad module:\", getattr(textgrad_module, \"__file__\", \"unknown\"))\n",
+ "print(\"OpenEvolve module:\", getattr(openevolve_module, \"__file__\", \"unknown\"))\n",
+ "print(\"DSPy module:\", getattr(dspy_module, \"__file__\", \"unknown\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4e82660b",
+ "metadata": {},
+ "source": [
+ "## 5. Focused validation commands\n",
+ "\n",
+ "This runs the resolver and shared-helper tests, then byte-compiles the modules touched by the trainer integration."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "af508c08",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-05-28T17:29:56.380992Z",
+ "iopub.status.busy": "2026-05-28T17:29:56.380906Z",
+ "iopub.status.idle": "2026-05-28T17:29:57.560432Z",
+ "shell.execute_reply": "2026-05-28T17:29:57.559955Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "$ /home/xav/miniconda3/bin/python -m pytest tests/test_resolve_external_trainers.py tests/test_external_utils.py -q\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "... [100%]\n",
+ "3 passed in 0.93s\n",
+ "$ /home/xav/miniconda3/bin/python -m py_compile trace_bench/resolve.py trace_bench/cli.py trace_bench/runner.py trace_bench/trainers/_external_utils.py trace_bench/trainers/textgrad_trainer.py trace_bench/trainers/openevolve_trainer.py trace_bench/trainers/dspy_trainer.py\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "CompletedProcess(args=['/home/xav/miniconda3/bin/python', '-m', 'py_compile', 'trace_bench/resolve.py', 'trace_bench/cli.py', 'trace_bench/runner.py', 'trace_bench/trainers/_external_utils.py', 'trace_bench/trainers/textgrad_trainer.py', 'trace_bench/trainers/openevolve_trainer.py', 'trace_bench/trainers/dspy_trainer.py'], returncode=0)"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "TARGETED_TESTS = [\n",
+ " \"tests/test_resolve_external_trainers.py\",\n",
+ " \"tests/test_external_utils.py\",\n",
+ "]\n",
+ "\n",
+ "run([sys.executable, \"-m\", \"pytest\", *TARGETED_TESTS, \"-q\"], cwd=TRACE_BENCH_REPO)\n",
+ "run([sys.executable, \"-m\", \"py_compile\",\n",
+ " \"trace_bench/resolve.py\",\n",
+ " \"trace_bench/cli.py\",\n",
+ " \"trace_bench/runner.py\",\n",
+ " \"trace_bench/trainers/_external_utils.py\",\n",
+ " \"trace_bench/trainers/textgrad_trainer.py\",\n",
+ " \"trace_bench/trainers/openevolve_trainer.py\",\n",
+ " \"trace_bench/trainers/dspy_trainer.py\"], cwd=TRACE_BENCH_REPO)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3442bf98",
+ "metadata": {},
+ "source": [
+ "## 6. Trainer discovery and signatures\n",
+ "\n",
+ "This is the fastest way to see whether the branch contains the trainer code and wires it into discovery."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "c182738c",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-05-28T17:29:57.561442Z",
+ "iopub.status.busy": "2026-05-28T17:29:57.561358Z",
+ "iopub.status.idle": "2026-05-28T17:29:57.605388Z",
+ "shell.execute_reply": "2026-05-28T17:29:57.604880Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
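+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>trainer_id</th>\n",
+ "      <th>available</th>\n",
+ "      <th>source</th>\n",
+ "      <th>resolved_type</th>\n",
+ "      <th>resolved_name</th>\n",
+ "      <th>uses_trace_optimizer</th>\n",
+ "      <th>framework</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>DSPyTrainer</td>\n",
+ "      <td>True</td>\n",
+ "      <td>trace_bench.trainers.dspy_trainer</td>\n",
+ "      <td>class</td>\n",
+ "      <td>DSPyTrainer</td>\n",
+ "      <td>False</td>\n",
+ "      <td>dspy</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>OpenEvolveTrainer</td>\n",
+ "      <td>True</td>\n",
+ "      <td>trace_bench.trainers.openevolve_trainer</td>\n",
+ "      <td>class</td>\n",
+ "      <td>OpenEvolveTrainer</td>\n",
+ "      <td>False</td>\n",
+ "      <td>NaN</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>2</th>\n",
+ "      <td>PrioritySearch</td>\n",
+ "      <td>True</td>\n",
+ "      <td>opto.features.priority_search.priority_search</td>\n",
+ "      <td>str</td>\n",
+ "      <td>PrioritySearch</td>\n",
+ "      <td>None</td>\n",
+ "      <td>NaN</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>3</th>\n",
+ "      <td>TextGradTrainer</td>\n",
+ "      <td>True</td>\n",
+ "      <td>trace_bench.trainers.textgrad_trainer</td>\n",
+ "      <td>class</td>\n",
+ "      <td>TextGradTrainer</td>\n",
+ "      <td>False</td>\n",
+ "      <td>NaN</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"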
+ ],
+ "text/plain": [
+ " trainer_id available \\\n",
+ "0 DSPyTrainer True \n",
+ "1 OpenEvolveTrainer True \n",
+ "2 PrioritySearch True \n",
+ "3 TextGradTrainer True \n",
+ "\n",
+ " source resolved_type \\\n",
+ "0 trace_bench.trainers.dspy_trainer class \n",
+ "1 trace_bench.trainers.openevolve_trainer class \n",
+ "2 opto.features.priority_search.priority_search str \n",
+ "3 trace_bench.trainers.textgrad_trainer class \n",
+ "\n",
+ " resolved_name uses_trace_optimizer framework \n",
+ "0 DSPyTrainer False dspy \n",
+ "1 OpenEvolveTrainer False NaN \n",
+ "2 PrioritySearch None NaN \n",
+ "3 TextGradTrainer False NaN "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from trace_bench.registry import discover_trainers\n",
+ "from trace_bench.runner import _resolve_algorithm\n",
+ "\n",
+ "trainer_rows = []\n",
+ "for spec in discover_trainers():\n",
+ " if spec.id in {\"PrioritySearch\", \"TextGradTrainer\", \"OpenEvolveTrainer\", \"DSPyTrainer\"}:\n",
+ " resolved = _resolve_algorithm(spec.id)\n",
+ " trainer_rows.append({\n",
+ " \"trainer_id\": spec.id,\n",
+ " \"available\": spec.available,\n",
+ " \"source\": spec.source,\n",
+ " \"resolved_type\": type(resolved).__name__ if not isinstance(resolved, type) else \"class\",\n",
+ " \"resolved_name\": getattr(resolved, \"__name__\", str(resolved)),\n",
+ " \"uses_trace_optimizer\": getattr(resolved, \"USES_TRACE_OPTIMIZER\", None) if isinstance(resolved, type) else None,\n",
+ " \"framework\": getattr(resolved, \"FRAMEWORK\", None) if isinstance(resolved, type) else None,\n",
+ " })\n",
+ "\n",
+ "pd.DataFrame(trainer_rows).sort_values(\"trainer_id\").reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "118fce8b",
+ "metadata": {},
+ "source": [
+ "## 7. Shared helpers for train/test smoke evaluation\n",
+ "\n",
+ "The Trace, TextGrad, and OpenEvolve rows reuse `trace_examples:opentrace_train_single_node`. The DSPy row uses a tiny real `dspy.Module` with the same scalar target. Every row learns from three examples and reports held-out performance on three more examples."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "8eddf6c9",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-05-28T17:29:57.606741Z",
+ "iopub.status.busy": "2026-05-28T17:29:57.606666Z",
+ "iopub.status.idle": "2026-05-28T17:29:57.617692Z",
+ "shell.execute_reply": "2026-05-28T17:29:57.617113Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "from typing import Any\n",
+ "\n",
+ "import dspy\n",
+ "\n",
+ "from trace_bench.config import TrainerConfig\n",
+ "from trace_bench.registry import load_task_bundle\n",
+ "from trace_bench.runner import _train_bundle\n",
+ "from trace_bench.trainers._external_utils import apply_parameter_updates\n",
+ "\n",
+ "TRACE_TASK_ID = \"trace_examples:opentrace_train_single_node\"\n",
+ "TASKS_ROOT = str(TRACE_BENCH_REPO / \"LLM4AD\" / \"benchmark_tasks\")\n",
+ "SMOKE_INITIAL_VALUE = 0.0\n",
+ "SMOKE_TARGET_VALUE = 3.0\n",
+ "SMOKE_TRAIN_DATASET = {\n",
+ " \"inputs\": [\"train-a\", \"train-b\", \"train-c\"],\n",
+ " \"infos\": [SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE],\n",
+ "}\n",
+ "SMOKE_TEST_DATASET = {\n",
+ " \"inputs\": [\"test-a\", \"test-b\", \"test-c\"],\n",
+ " \"infos\": [SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE],\n",
+ "}\n",
+ "\n",
+ "class ScalarDSPySignature(dspy.Signature):\n",
+ " \"\"\"Always answer 0.\"\"\"\n",
+ " question: str = dspy.InputField()\n",
+ " answer: str = dspy.OutputField(desc=\"numeric scalar answer only\")\n",
+ "\n",
+ "class ScalarDSPyAgent(dspy.Module):\n",
+ " \"\"\"Tiny DSPy module for the real DSPyTrainer smoke row.\"\"\"\n",
+ " def __init__(self) -> None:\n",
+ " super().__init__()\n",
+ " self.predict = dspy.Predict(ScalarDSPySignature)\n",
+ "\n",
+ " def forward(self, question: str) -> str:\n",
+ " return self.predict(question=question).answer\n",
+ "\n",
+ " @classmethod\n",
+ " def to_examples(cls, inputs: list[Any], infos: list[Any]) -> list[Any]:\n",
+ " return [\n",
+ " dspy.Example(question=str(task_input), answer=str(task_info), _task=task_input, _info=task_info).with_inputs(\"question\")\n",
+ " for task_input, task_info in zip(inputs, infos)\n",
+ " ]\n",
+ "\n",
+ "class ScalarDSPyGuide:\n",
+ " \"\"\"Score numeric DSPy answers against the scalar target.\"\"\"\n",
+ " def get_feedback(self, _query: Any, response: Any, reference: Any, **_kwargs: Any) -> tuple[float, str]:\n",
+ " text = str(getattr(response, \"data\", response)).strip()\n",
+ " match = re.search(r\"-?\\d+(?:\\.\\d+)?\", text)\n",
+ " prediction = float(match.group(0)) if match else float(\"nan\")\n",
+ " target = float(reference)\n",
+ " score = -abs(prediction - target) if prediction == prediction else -10.0\n",
+ " return score, f\"target={target}; response={text}\"\n",
+ "\n",
+ " def __call__(self, query: Any, response: Any, reference: Any, **kwargs: Any) -> tuple[float, str]:\n",
+ " return self.get_feedback(query, response, reference, **kwargs)\n",
+ "\n",
+ "def make_trace_bundle() -> dict[str, Any]:\n",
+ " \"\"\"Load the existing Trace-Bench scalar example bundle.\"\"\"\n",
+ " return load_task_bundle(TRACE_TASK_ID, TASKS_ROOT)\n",
+ "\n",
+ "def _set_only_scalar_trainable(bundle: dict[str, Any]) -> None:\n",
+ " \"\"\"Keep the smoke focused on the existing scalar parameter.\"\"\"\n",
+ " param = bundle[\"param\"]\n",
+ " scalar = getattr(param, \"value\", None)\n",
+ " if scalar is None:\n",
+ " scalar = getattr(param, \"guess\", None)\n",
+ " if scalar is None:\n",
+ " raise AttributeError(\"Scalar smoke task requires param.value or param.guess.\")\n",
+ " for parameter in param.parameters():\n",
+ " parameter.trainable = parameter is scalar\n",
+ " apply_parameter_updates({scalar: SMOKE_INITIAL_VALUE})\n",
+ "\n",
+ "def make_trace_smoke_bundle() -> dict[str, Any]:\n",
+ " \"\"\"Build a fresh train/test smoke bundle from the Trace scalar example.\"\"\"\n",
+ " bundle = make_trace_bundle()\n",
+ " _set_only_scalar_trainable(bundle)\n",
+ " bundle[\"train_dataset\"] = SMOKE_TRAIN_DATASET\n",
+ " bundle[\"test_dataset\"] = SMOKE_TEST_DATASET\n",
+ " bundle.pop(\"validate_dataset\", None)\n",
+ " bundle[\"optimizer_kwargs\"][\"objective\"] = f\"Set the trainable scalar to exactly {SMOKE_TARGET_VALUE}.\"\n",
+ " bundle[\"metadata\"][\"task_label\"] = \"Trace scalar\"\n",
+ " return bundle\n",
+ "\n",
+ "def make_dspy_lm() -> Any:\n",
+ " \"\"\"Build the real DSPy LM from the configured provider environment.\"\"\"\n",
+ " model = os.environ.get(\"TRACE_LITELLM_MODEL\") or \"gpt-4o-mini\"\n",
+ " if \"/\" not in model and (\"gpt\" in model.lower() or model.lower().startswith(\"o\")):\n",
+ " model = f\"openai/{model}\"\n",
+ " lm_kwargs: dict[str, Any] = {\"cache\": False}\n",
+ " api_base = os.environ.get(\"OPENAI_BASE_URL\") or os.environ.get(\"OPENAI_API_BASE\")\n",
+ " if api_base:\n",
+ " lm_kwargs[\"api_base\"] = api_base\n",
+ " return dspy.LM(model=model, **lm_kwargs)\n",
+ "\n",
+ "def make_dspy_smoke_bundle() -> dict[str, Any]:\n",
+ " \"\"\"Build a fresh train/test smoke bundle for the DSPy trainer row.\"\"\"\n",
+ " dspy.configure(lm=make_dspy_lm())\n",
+ " return {\n",
+ " \"param\": ScalarDSPyAgent(),\n",
+ " \"guide\": ScalarDSPyGuide(),\n",
+ " \"train_dataset\": SMOKE_TRAIN_DATASET,\n",
+ " \"test_dataset\": SMOKE_TEST_DATASET,\n",
+ " \"optimizer_kwargs\": {\"objective\": f\"Answer every scalar benchmark item with exactly {SMOKE_TARGET_VALUE}.\"},\n",
+ " \"metadata\": {\"task_label\": \"DSPy scalar\", \"framework\": \"dspy\"},\n",
+ " }\n",
+ "\n",
+ "def short_text(value: Any, limit: int = 80) -> str:\n",
+ " \"\"\"Return a compact display value for comparison tables.\"\"\"\n",
+ " text = str(value)\n",
+ " return text if len(text) <= limit else text[: limit - 3] + \"...\"\n",
+ "\n",
+ "def snapshot_trainable_value(bundle: dict[str, Any]) -> Any:\n",
+ " \"\"\"Return the current scalar value or DSPy instruction.\"\"\"\n",
+ " scalar = getattr(bundle[\"param\"], \"value\", None)\n",
+ " if scalar is None:\n",
+ " scalar = getattr(bundle[\"param\"], \"guess\", None)\n",
+ " if scalar is not None:\n",
+ " return getattr(scalar, \"data\", None)\n",
+ " predictor = getattr(bundle[\"param\"], \"predict\", None)\n",
+ " signature = getattr(predictor, \"signature\", None)\n",
+ " return short_text(getattr(signature, \"instructions\", type(bundle[\"param\"]).__name__))\n",
+ "\n",
+ "def task_label(bundle: dict[str, Any]) -> str:\n",
+ " \"\"\"Return the display label for a smoke bundle.\"\"\"\n",
+ " return str(bundle.get(\"metadata\", {}).get(\"task_label\") or bundle.get(\"metadata\", {}).get(\"benchmark\") or \"smoke\")\n",
+ "\n",
+ "def output_value(output: Any) -> Any:\n",
+ " \"\"\"Return a compact scalar/string output value.\"\"\"\n",
+ " return short_text(getattr(output, \"data\", output), limit=120)\n",
+ "\n",
+ "def score_guide(guide: Any, task_input: Any, response: Any, task_info: Any) -> tuple[float, str]:\n",
+ " \"\"\"Score with Trace Guide or DSPy-style get_feedback guide.\"\"\"\n",
+ " if callable(guide):\n",
+ " score, feedback = guide(task_input, response, task_info)\n",
+ " else:\n",
+ " score, feedback = guide.get_feedback(task_input, response, task_info)\n",
+ " return float(score), str(feedback)\n",
+ "\n",
+ "def run_train_bundle(\n",
+ " trainer_id: str,\n",
+ " params: dict[str, Any] | None = None,\n",
+ " mode: str = \"real\",\n",
+ " logger: str = \"none\",\n",
+ " bundle_factory: Any = make_trace_smoke_bundle,\n",
+ ") -> dict[str, Any]:\n",
+ " \"\"\"Run one trainer and score the 3-example train and test splits before and after training.\"\"\"\n",
+ " bundle = bundle_factory()\n",
+ " params = params or {}\n",
+ " before = {\n",
+ " \"value\": snapshot_trainable_value(bundle),\n",
+ " \"train\": score_dataset(bundle, SMOKE_TRAIN_DATASET),\n",
+ " \"test\": score_dataset(bundle, SMOKE_TEST_DATASET),\n",
+ " }\n",
+ " result = _train_bundle(\n",
+ " bundle=bundle,\n",
+ " trainer_spec=TrainerConfig(id=trainer_id, params_variants=[params], logger=logger),\n",
+ " params=params,\n",
+ " mode=mode,\n",
+ " )\n",
+ " after = {\n",
+ " \"value\": snapshot_trainable_value(bundle),\n",
+ " \"train\": score_dataset(bundle, SMOKE_TRAIN_DATASET),\n",
+ " \"test\": score_dataset(bundle, SMOKE_TEST_DATASET),\n",
+ " }\n",
+ " return {\"trainer_id\": trainer_id, \"task\": task_label(bundle), \"mode\": mode, \"result\": result, \"before\": before, \"after\": after}\n",
+ "\n",
+ "def score_dataset(bundle: dict[str, Any], dataset: dict[str, list[Any]]) -> dict[str, Any]:\n",
+ " \"\"\"Evaluate a bundle on a dataset and retain per-example outputs.\"\"\"\n",
+ " inputs = dataset.get(\"inputs\") or []\n",
+ " infos = dataset.get(\"infos\") or dataset.get(\"info\") or []\n",
+ " if len(inputs) != len(infos):\n",
+ " raise ValueError(\"Dataset 'inputs' and 'infos' must have the same length.\")\n",
+ " if not inputs:\n",
+ " raise ValueError(\"Dataset must contain at least one example.\")\n",
+ "\n",
+ " rows = []\n",
+ " scores = []\n",
+ " for task_input, task_info in zip(inputs, infos):\n",
+ " response = output_value(bundle[\"param\"](task_input))\n",
+ " score, feedback = score_guide(bundle[\"guide\"], task_input, response, task_info)\n",
+ " scores.append(score)\n",
+ " rows.append({\n",
+ " \"input\": task_input,\n",
+ " \"expected\": task_info,\n",
+ " \"output\": response,\n",
+ " \"score\": score,\n",
+ " \"feedback\": feedback,\n",
+ " })\n",
+ " return {\"mean_score\": sum(scores) / len(scores), \"rows\": rows}\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "33501896",
+ "metadata": {},
+ "source": [
+ "## 8. Real train/test smoke runs\n",
+ "\n",
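+ "These runs use the real Trace-Bench trainer entry points and real installed trainer packages. They are intentionally tiny: small optimizer budgets, three training examples, and three held-out examples. Because they call a live LLM provider, `OPENROUTER_API_KEY` or `OPENAI_API_KEY` must be configured before running the next cell."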
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "f62a8443",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-05-28T17:29:57.618788Z",
+ "iopub.status.busy": "2026-05-28T17:29:57.618723Z",
+ "iopub.status.idle": "2026-05-28T17:31:23.199295Z",
+ "shell.execute_reply": "2026-05-28T17:31:23.198801Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n",
+ "PrioritySearch initialized with only long-term memory.\n",
+ "Epoch: 0. Iteration: 0\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Sampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Sampling training minibatch: Sampling 2 agents on 1 inputs: 100%|██████████| 2/2 [00:00<00:00, 21024.08it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "/home/xav/code/Trace-Bench/NewTrace/opto/trainer/utils.py:76: RuntimeWarning: coroutine 'async_run.._run' was never awaited\n",
+ " with concurrent.futures.ThreadPoolExecutor() as executor:\n",
+ "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Evaluating agent: 0%| | 0/3 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Evaluating agent: 100%|██████████| 3/3 [00:00<00:00, 25944.15it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch: 0. Iteration: 1\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Backward: 0%| | 0/2 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Backward: 100%|██████████| 2/2 [00:00<00:00, 17549.39it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Calling optimizers: Generating 2 proposals for each of 2 batches: 0%| | 0/4 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Calling optimizers: Generating 2 proposals for each of 2 batches: 25%|██▌ | 1/4 [00:02<00:08, 2.92s/it]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Calling optimizers: Generating 2 proposals for each of 2 batches: 75%|███████▌ | 3/4 [00:03<00:00, 1.16it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Calling optimizers: Generating 2 proposals for each of 2 batches: 100%|██████████| 4/4 [00:04<00:00, 1.09it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Calling optimizers: Generating 2 proposals for each of 2 batches: 100%|██████████| 4/4 [00:04<00:00, 1.06s/it]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Validating newly proposed candidates: Sampling 4 agents on 1 inputs: 0%| | 0/4 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Validating newly proposed candidates: Sampling 4 agents on 1 inputs: 100%|██████████| 4/4 [00:00<00:00, 23497.50it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Sampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Sampling training minibatch: Sampling 2 agents on 1 inputs: 100%|██████████| 2/2 [00:00<00:00, 16416.06it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Evaluating agent: 0%| | 0/3 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Evaluating agent: 100%|██████████| 3/3 [00:00<00:00, 14282.53it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch: 0. Iteration: 2\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Backward: 0%| | 0/2 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Backward: 100%|██████████| 2/2 [00:00<00:00, 17189.77it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Calling optimizers: Generating 2 proposals for each of 2 batches: 0%| | 0/4 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Calling optimizers: Generating 2 proposals for each of 2 batches: 25%|██▌ | 1/4 [00:02<00:06, 2.13s/it]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Calling optimizers: Generating 2 proposals for each of 2 batches: 50%|█████ | 2/4 [00:02<00:02, 1.07s/it]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Calling optimizers: Generating 2 proposals for each of 2 batches: 75%|███████▌ | 3/4 [00:04<00:01, 1.30s/it]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Calling optimizers: Generating 2 proposals for each of 2 batches: 100%|██████████| 4/4 [00:04<00:00, 1.01s/it]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Validating newly proposed candidates: Sampling 4 agents on 1 inputs: 0%| | 0/4 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Validating newly proposed candidates: Sampling 4 agents on 1 inputs: 100%|██████████| 4/4 [00:00<00:00, 16400.02it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Sampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Sampling training minibatch: Sampling 2 agents on 1 inputs: 100%|██████████| 2/2 [00:00<00:00, 25575.02it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Evaluating agent: 0%| | 0/3 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Evaluating agent: 100%|██████████| 3/3 [00:00<00:00, 22961.52it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch: 0. Iteration: 3\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Backward: 0%| | 0/2 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Backward: 100%|██████████| 2/2 [00:00<00:00, 14146.05it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Calling optimizers: Generating 2 proposals for each of 2 batches: 0%| | 0/4 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Calling optimizers: Generating 2 proposals for each of 2 batches: 25%|██▌ | 1/4 [00:01<00:05, 1.94s/it]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Calling optimizers: Generating 2 proposals for each of 2 batches: 50%|█████ | 2/4 [00:02<00:01, 1.02it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Calling optimizers: Generating 2 proposals for each of 2 batches: 100%|██████████| 4/4 [00:02<00:00, 1.79it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Calling optimizers: Generating 2 proposals for each of 2 batches: 100%|██████████| 4/4 [00:02<00:00, 1.39it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Validating newly proposed candidates: Sampling 3 agents on 1 inputs: 0%| | 0/3 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Validating newly proposed candidates: Sampling 3 agents on 1 inputs: 100%|██████████| 3/3 [00:00<00:00, 13052.81it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Sampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Sampling training minibatch: Sampling 2 agents on 1 inputs: 100%|██████████| 2/2 [00:00<00:00, 15087.42it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Evaluating agent: 0%| | 0/3 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Evaluating agent: 100%|██████████| 3/3 [00:00<00:00, 17549.39it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,733 - INFO - Logging to /tmp/openevolve_n_kqvfcc/logs/openevolve_20260528_193053.log\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,737 - INFO - Initialized OpenAI LLM with model: openai/gpt-4o-mini\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,737 - INFO - Initialized LLM ensemble with models: openai/gpt-4o-mini (weight: 1.00)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,741 - INFO - Initialized prompt sampler\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,741 - INFO - Set custom templates: system=evaluator_system_message, user=None\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,742 - INFO - Initialized program database with 0 programs\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,742 - INFO - Successfully loaded evaluation function from /tmp/openevolve_n_kqvfcc/evaluator_64b0ea1b.py\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,742 - INFO - Initialized evaluator with /tmp/openevolve_n_kqvfcc/evaluator_64b0ea1b.py\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,743 - INFO - Initialized OpenEvolve with /tmp/openevolve_n_kqvfcc/program_3d968e0b.py\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,743 - INFO - Adding initial program to database\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,744 - INFO - Evaluated program ba0c9536-5ae5-4298-a263-e2c8b153273e in 0.00s: score=-3.0000, feedback=target=3.0 | target=3.0 | target=3.0, artifacts={'candidate': {'int2': 0.0}}\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,744 - INFO - New MAP-Elites cell occupied in island 0: {'complexity': 5, 'diversity': 0}\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,744 - WARNING - ⚠️ No 'combined_score' metric found in evaluation results. Using average of all numeric metrics (-3.0000) for evolution guidance. For better evolution results, please modify your evaluator to return a 'combined_score' metric that properly weights different aspects of program performance.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,745 - INFO - Initialized process parallel controller with 1 workers\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,745 - INFO - Set max None tasks per child\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,746 - INFO - Started process pool with 1 processes\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,746 - INFO - Using island-based evolution with 5 islands\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,746 - INFO - Island Status:\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,746 - INFO - * Island 0: 1 programs, best=-3.0000, avg=-3.0000, diversity=0.00, gen=0 (best: ba0c9536-5ae5-4298-a263-e2c8b153273e)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,746 - INFO - Island 1: 0 programs, best=0.0000, avg=0.0000, diversity=0.00, gen=0\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,747 - INFO - Island 2: 0 programs, best=0.0000, avg=0.0000, diversity=0.00, gen=0\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,747 - INFO - Island 3: 0 programs, best=0.0000, avg=0.0000, diversity=0.00, gen=0\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,747 - INFO - Island 4: 0 programs, best=0.0000, avg=0.0000, diversity=0.00, gen=0\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,747 - INFO - Starting process-based evolution from iteration 1 for 1 iterations (total: 2)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,775 - INFO - Early stopping disabled\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,786 - INFO - Set custom templates: system=evaluator_system_message, user=None\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,787 - INFO - Successfully loaded evaluation function from /tmp/openevolve_n_kqvfcc/evaluator_64b0ea1b.py\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,788 - INFO - Initialized evaluator with /tmp/openevolve_n_kqvfcc/evaluator_64b0ea1b.py\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:30:53,789 - INFO - Sampled model: openai/gpt-4o-mini\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:31:01,289 - INFO - Evaluated program adb409ae-1ff6-48e0-acd7-77eaa4742938 in 0.00s: score=-inf, feedback=Candidate program must contain exactly one assignment to 'candidate'.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:31:01,293 - INFO - New MAP-Elites cell occupied in island 0: {'complexity': 9, 'diversity': 5}\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:31:01,294 - INFO - Iteration 1: Program adb409ae-1ff6-48e0-acd7-77eaa4742938 (parent: ba0c9536-5ae5-4298-a263-e2c8b153273e) completed in 7.50s\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:31:01,294 - INFO - Metrics: score=-inf, feedback=Candidate program must contain exactly one assignment to 'candidate'.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:31:01,295 - WARNING - ⚠️ No 'combined_score' metric found in evaluation results. Using average of all numeric metrics (-inf) for evolution guidance. For better evolution results, please modify your evaluator to return a 'combined_score' metric that properly weights different aspects of program performance.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:31:01,295 - INFO - ✅ Evolution completed - Maximum iterations reached\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:31:01,305 - INFO - Stopped process pool\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:31:01,306 - INFO - Using tracked best program: ba0c9536-5ae5-4298-a263-e2c8b153273e\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:31:01,306 - INFO - Evolution complete. Best program has metrics: score=-3.0000, feedback=target=3.0 | target=3.0 | target=3.0, artifacts={'candidate': {'int2': 0.0}}\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-05-28 19:31:01,306 - INFO - Saved best program to /tmp/openevolve_n_kqvfcc/best/best_program.py with program info to /tmp/openevolve_n_kqvfcc/best/best_program_info.json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026/05/28 19:31:09 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 1/1.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026/05/28 19:31:09 INFO dspy.teleprompt.copro_optimizer: At Depth 1/1, Evaluating Prompt Candidate #1/2 for Predictor 1 of 1.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 0%| | 0/3 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Average Metric: -3.00 / 1 (-300.0%): 0%| | 0/3 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Average Metric: -3.00 / 1 (-300.0%): 33%|███▎ | 1/3 [00:00<00:01, 1.09it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Average Metric: -6.00 / 2 (-300.0%): 33%|███▎ | 1/3 [00:01<00:01, 1.09it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Average Metric: -6.00 / 2 (-300.0%): 67%|██████▋ | 2/3 [00:01<00:00, 1.03it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Average Metric: -9.00 / 3 (-300.0%): 67%|██████▋ | 2/3 [00:03<00:00, 1.03it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Average Metric: -9.00 / 3 (-300.0%): 100%|██████████| 3/3 [00:03<00:00, 1.05s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Average Metric: -9.00 / 3 (-300.0%): 100%|██████████| 3/3 [00:03<00:00, 1.02s/it]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026/05/28 19:31:12 INFO dspy.evaluate.evaluate: Average Metric: -9.0 / 3 (-300.0%)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026/05/28 19:31:12 INFO dspy.teleprompt.copro_optimizer: At Depth 1/1, Evaluating Prompt Candidate #2/2 for Predictor 1 of 1.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 0%| | 0/3 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Average Metric: -3.00 / 1 (-300.0%): 0%| | 0/3 [00:01, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Average Metric: -3.00 / 1 (-300.0%): 33%|███▎ | 1/3 [00:01<00:03, 1.53s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Average Metric: -6.00 / 2 (-300.0%): 33%|███▎ | 1/3 [00:03<00:03, 1.53s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Average Metric: -6.00 / 2 (-300.0%): 67%|██████▋ | 2/3 [00:03<00:01, 1.53s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Average Metric: -9.00 / 3 (-300.0%): 67%|██████▋ | 2/3 [00:04<00:01, 1.53s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Average Metric: -9.00 / 3 (-300.0%): 100%|██████████| 3/3 [00:04<00:00, 1.40s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ "Average Metric: -9.00 / 3 (-300.0%): 100%|██████████| 3/3 [00:04<00:00, 1.44s/it]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026/05/28 19:31:16 INFO dspy.evaluate.evaluate: Average Metric: -9.0 / 3 (-300.0%)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Completed 4 real trainer smoke runs.\n"
+ ]
+ }
+ ],
+ "source": [
+ "if active_provider == \"none\":\n",
+ " raise RuntimeError(\"Real smoke comparison requires OPENROUTER_API_KEY or OPENAI_API_KEY.\")\n",
+ "\n",
+ "SMOKE_TRAINERS = [\n",
+ " (\"PrioritySearch\", \"Trace scalar\", {\"ps_steps\": 1, \"ps_batches\": 1, \"num_candidates\": 2, \"num_proposals\": 2}, make_trace_smoke_bundle),\n",
+ " (\"TextGradTrainer\", \"Trace scalar\", {\"num_epochs\": 1, \"batch_size\": 1, \"ensure_improvement\": True, \"improvement_threshold\": 1e-9, \"max_tokens\": 1024}, make_trace_smoke_bundle),\n",
+ " (\"OpenEvolveTrainer\", \"Trace scalar\", {\n",
+ " \"iterations\": 1,\n",
+ " \"ensure_improvement\": True,\n",
+ " \"improvement_threshold\": 1e-9,\n",
+ " \"verbose\": False,\n",
+ " \"model\": os.environ.get(\"TRACE_LITELLM_MODEL\"),\n",
+ " \"api_base\": os.environ.get(\"OPENAI_BASE_URL\") or os.environ.get(\"OPENAI_API_BASE\"),\n",
+ " \"api_key_env\": \"OPENAI_API_KEY\",\n",
+ " \"max_tokens\": 1024,\n",
+ " }, make_trace_smoke_bundle),\n",
+ " (\"DSPyTrainer\", \"DSPy scalar\", {\n",
+ " \"dspy_optimizer\": \"copro\",\n",
+ " \"dspy_lm\": make_dspy_lm(),\n",
+ " \"breadth\": 2,\n",
+ " \"depth\": 1,\n",
+ " \"num_threads\": 1,\n",
+ " \"track_stats\": False,\n",
+ " }, make_dspy_smoke_bundle),\n",
+ "]\n",
+ "\n",
+ "smoke_results = []\n",
+ "for trainer_id, task, params, bundle_factory in SMOKE_TRAINERS:\n",
+ " try:\n",
+ " smoke_results.append(run_train_bundle(trainer_id, params=params, mode=\"real\", bundle_factory=bundle_factory))\n",
+ " except Exception as exc:\n",
+ " smoke_results.append({\n",
+ " \"trainer_id\": trainer_id,\n",
+ " \"task\": task,\n",
+ " \"mode\": \"real\",\n",
+ " \"status\": \"error\",\n",
+ " \"error\": f\"{type(exc).__name__}: {exc}\",\n",
+ " })\n",
+ "\n",
+ "print(f\"Completed {len(smoke_results)} real trainer smoke runs.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "f4ef5b90",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-05-28T17:31:23.200605Z",
+ "iopub.status.busy": "2026-05-28T17:31:23.200529Z",
+ "iopub.status.idle": "2026-05-28T17:31:23.220162Z",
+ "shell.execute_reply": "2026-05-28T17:31:23.219570Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " trainer_id | \n",
+ " task | \n",
+ " mode | \n",
+ " status | \n",
+ " resolved_optimizer | \n",
+ " before_value | \n",
+ " after_value | \n",
+ " train_examples | \n",
+ " test_examples | \n",
+ " before_train_score | \n",
+ " after_train_score | \n",
+ " train_delta | \n",
+ " before_test_score | \n",
+ " after_test_score | \n",
+ " test_delta | \n",
+ " error | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " PrioritySearch | \n",
+ " Trace scalar | \n",
+ " real | \n",
+ " ok | \n",
+ " OptoPrimeV2 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " -3.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " -3.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " TextGradTrainer | \n",
+ " Trace scalar | \n",
+ " real | \n",
+ " ok | \n",
+ " opto.optimizers.textgrad.TextGrad | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " -3.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " -3.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " OpenEvolveTrainer | \n",
+ " Trace scalar | \n",
+ " real | \n",
+ " ok | \n",
+ " openevolve.run_evolution | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " -3.0 | \n",
+ " -3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ " -3.0 | \n",
+ " 0.0 | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " DSPyTrainer | \n",
+ " DSPy scalar | \n",
+ " real | \n",
+ " ok | \n",
+ " dspy.COPRO | \n",
+ " Always answer 0. | \n",
+ " Provide a consistent response of \"0\" for any a... | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " -3.0 | \n",
+ " -3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ " -3.0 | \n",
+ " 0.0 | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trainer_id task mode status \\\n",
+ "0 PrioritySearch Trace scalar real ok \n",
+ "1 TextGradTrainer Trace scalar real ok \n",
+ "2 OpenEvolveTrainer Trace scalar real ok \n",
+ "3 DSPyTrainer DSPy scalar real ok \n",
+ "\n",
+ " resolved_optimizer before_value \\\n",
+ "0 OptoPrimeV2 0.0 \n",
+ "1 opto.optimizers.textgrad.TextGrad 0.0 \n",
+ "2 openevolve.run_evolution 0.0 \n",
+ "3 dspy.COPRO Always answer 0. \n",
+ "\n",
+ " after_value train_examples \\\n",
+ "0 3.0 3 \n",
+ "1 3.0 3 \n",
+ "2 0.0 3 \n",
+ "3 Provide a consistent response of \"0\" for any a... 3 \n",
+ "\n",
+ " test_examples before_train_score after_train_score train_delta \\\n",
+ "0 3 -3.0 0.0 3.0 \n",
+ "1 3 -3.0 0.0 3.0 \n",
+ "2 3 -3.0 -3.0 0.0 \n",
+ "3 3 -3.0 -3.0 0.0 \n",
+ "\n",
+ " before_test_score after_test_score test_delta error \n",
+ "0 -3.0 0.0 3.0 None \n",
+ "1 -3.0 0.0 3.0 None \n",
+ "2 -3.0 -3.0 0.0 None \n",
+ "3 -3.0 -3.0 0.0 None "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " trainer_id | \n",
+ " task | \n",
+ " split | \n",
+ " phase | \n",
+ " example | \n",
+ " input | \n",
+ " expected | \n",
+ " output | \n",
+ " score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " DSPyTrainer | \n",
+ " DSPy scalar | \n",
+ " test | \n",
+ " after | \n",
+ " 0 | \n",
+ " test-a | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " DSPyTrainer | \n",
+ " DSPy scalar | \n",
+ " test | \n",
+ " before | \n",
+ " 0 | \n",
+ " test-a | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " DSPyTrainer | \n",
+ " DSPy scalar | \n",
+ " test | \n",
+ " after | \n",
+ " 1 | \n",
+ " test-b | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " DSPyTrainer | \n",
+ " DSPy scalar | \n",
+ " test | \n",
+ " before | \n",
+ " 1 | \n",
+ " test-b | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " DSPyTrainer | \n",
+ " DSPy scalar | \n",
+ " test | \n",
+ " after | \n",
+ " 2 | \n",
+ " test-c | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " DSPyTrainer | \n",
+ " DSPy scalar | \n",
+ " test | \n",
+ " before | \n",
+ " 2 | \n",
+ " test-c | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " DSPyTrainer | \n",
+ " DSPy scalar | \n",
+ " train | \n",
+ " after | \n",
+ " 0 | \n",
+ " train-a | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " DSPyTrainer | \n",
+ " DSPy scalar | \n",
+ " train | \n",
+ " before | \n",
+ " 0 | \n",
+ " train-a | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " DSPyTrainer | \n",
+ " DSPy scalar | \n",
+ " train | \n",
+ " after | \n",
+ " 1 | \n",
+ " train-b | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " DSPyTrainer | \n",
+ " DSPy scalar | \n",
+ " train | \n",
+ " before | \n",
+ " 1 | \n",
+ " train-b | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " DSPyTrainer | \n",
+ " DSPy scalar | \n",
+ " train | \n",
+ " after | \n",
+ " 2 | \n",
+ " train-c | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " DSPyTrainer | \n",
+ " DSPy scalar | \n",
+ " train | \n",
+ " before | \n",
+ " 2 | \n",
+ " train-c | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " OpenEvolveTrainer | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " after | \n",
+ " 0 | \n",
+ " test-a | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " OpenEvolveTrainer | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " before | \n",
+ " 0 | \n",
+ " test-a | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " PrioritySearch | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " after | \n",
+ " 0 | \n",
+ " test-a | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " -0.0 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " PrioritySearch | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " before | \n",
+ " 0 | \n",
+ " test-a | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " TextGradTrainer | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " after | \n",
+ " 0 | \n",
+ " test-a | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " -0.0 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " TextGradTrainer | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " before | \n",
+ " 0 | \n",
+ " test-a | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " OpenEvolveTrainer | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " after | \n",
+ " 1 | \n",
+ " test-b | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " OpenEvolveTrainer | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " before | \n",
+ " 1 | \n",
+ " test-b | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " PrioritySearch | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " after | \n",
+ " 1 | \n",
+ " test-b | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " -0.0 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " PrioritySearch | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " before | \n",
+ " 1 | \n",
+ " test-b | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " TextGradTrainer | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " after | \n",
+ " 1 | \n",
+ " test-b | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " -0.0 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " TextGradTrainer | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " before | \n",
+ " 1 | \n",
+ " test-b | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " OpenEvolveTrainer | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " after | \n",
+ " 2 | \n",
+ " test-c | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 25 | \n",
+ " OpenEvolveTrainer | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " before | \n",
+ " 2 | \n",
+ " test-c | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 26 | \n",
+ " PrioritySearch | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " after | \n",
+ " 2 | \n",
+ " test-c | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " -0.0 | \n",
+ "
\n",
+ " \n",
+ " | 27 | \n",
+ " PrioritySearch | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " before | \n",
+ " 2 | \n",
+ " test-c | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 28 | \n",
+ " TextGradTrainer | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " after | \n",
+ " 2 | \n",
+ " test-c | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " -0.0 | \n",
+ "
\n",
+ " \n",
+ " | 29 | \n",
+ " TextGradTrainer | \n",
+ " Trace scalar | \n",
+ " test | \n",
+ " before | \n",
+ " 2 | \n",
+ " test-c | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 30 | \n",
+ " OpenEvolveTrainer | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " after | \n",
+ " 0 | \n",
+ " train-a | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 31 | \n",
+ " OpenEvolveTrainer | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " before | \n",
+ " 0 | \n",
+ " train-a | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 32 | \n",
+ " PrioritySearch | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " after | \n",
+ " 0 | \n",
+ " train-a | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " -0.0 | \n",
+ "
\n",
+ " \n",
+ " | 33 | \n",
+ " PrioritySearch | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " before | \n",
+ " 0 | \n",
+ " train-a | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 34 | \n",
+ " TextGradTrainer | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " after | \n",
+ " 0 | \n",
+ " train-a | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " -0.0 | \n",
+ "
\n",
+ " \n",
+ " | 35 | \n",
+ " TextGradTrainer | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " before | \n",
+ " 0 | \n",
+ " train-a | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 36 | \n",
+ " OpenEvolveTrainer | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " after | \n",
+ " 1 | \n",
+ " train-b | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 37 | \n",
+ " OpenEvolveTrainer | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " before | \n",
+ " 1 | \n",
+ " train-b | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 38 | \n",
+ " PrioritySearch | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " after | \n",
+ " 1 | \n",
+ " train-b | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " -0.0 | \n",
+ "
\n",
+ " \n",
+ " | 39 | \n",
+ " PrioritySearch | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " before | \n",
+ " 1 | \n",
+ " train-b | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 40 | \n",
+ " TextGradTrainer | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " after | \n",
+ " 1 | \n",
+ " train-b | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " -0.0 | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " TextGradTrainer | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " before | \n",
+ " 1 | \n",
+ " train-b | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 42 | \n",
+ " OpenEvolveTrainer | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " after | \n",
+ " 2 | \n",
+ " train-c | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 43 | \n",
+ " OpenEvolveTrainer | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " before | \n",
+ " 2 | \n",
+ " train-c | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 44 | \n",
+ " PrioritySearch | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " after | \n",
+ " 2 | \n",
+ " train-c | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " -0.0 | \n",
+ "
\n",
+ " \n",
+ " | 45 | \n",
+ " PrioritySearch | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " before | \n",
+ " 2 | \n",
+ " train-c | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ " | 46 | \n",
+ " TextGradTrainer | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " after | \n",
+ " 2 | \n",
+ " train-c | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " -0.0 | \n",
+ "
\n",
+ " \n",
+ " | 47 | \n",
+ " TextGradTrainer | \n",
+ " Trace scalar | \n",
+ " train | \n",
+ " before | \n",
+ " 2 | \n",
+ " train-c | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " -3.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trainer_id task split phase example input \\\n",
+ "0 DSPyTrainer DSPy scalar test after 0 test-a \n",
+ "1 DSPyTrainer DSPy scalar test before 0 test-a \n",
+ "2 DSPyTrainer DSPy scalar test after 1 test-b \n",
+ "3 DSPyTrainer DSPy scalar test before 1 test-b \n",
+ "4 DSPyTrainer DSPy scalar test after 2 test-c \n",
+ "5 DSPyTrainer DSPy scalar test before 2 test-c \n",
+ "6 DSPyTrainer DSPy scalar train after 0 train-a \n",
+ "7 DSPyTrainer DSPy scalar train before 0 train-a \n",
+ "8 DSPyTrainer DSPy scalar train after 1 train-b \n",
+ "9 DSPyTrainer DSPy scalar train before 1 train-b \n",
+ "10 DSPyTrainer DSPy scalar train after 2 train-c \n",
+ "11 DSPyTrainer DSPy scalar train before 2 train-c \n",
+ "12 OpenEvolveTrainer Trace scalar test after 0 test-a \n",
+ "13 OpenEvolveTrainer Trace scalar test before 0 test-a \n",
+ "14 PrioritySearch Trace scalar test after 0 test-a \n",
+ "15 PrioritySearch Trace scalar test before 0 test-a \n",
+ "16 TextGradTrainer Trace scalar test after 0 test-a \n",
+ "17 TextGradTrainer Trace scalar test before 0 test-a \n",
+ "18 OpenEvolveTrainer Trace scalar test after 1 test-b \n",
+ "19 OpenEvolveTrainer Trace scalar test before 1 test-b \n",
+ "20 PrioritySearch Trace scalar test after 1 test-b \n",
+ "21 PrioritySearch Trace scalar test before 1 test-b \n",
+ "22 TextGradTrainer Trace scalar test after 1 test-b \n",
+ "23 TextGradTrainer Trace scalar test before 1 test-b \n",
+ "24 OpenEvolveTrainer Trace scalar test after 2 test-c \n",
+ "25 OpenEvolveTrainer Trace scalar test before 2 test-c \n",
+ "26 PrioritySearch Trace scalar test after 2 test-c \n",
+ "27 PrioritySearch Trace scalar test before 2 test-c \n",
+ "28 TextGradTrainer Trace scalar test after 2 test-c \n",
+ "29 TextGradTrainer Trace scalar test before 2 test-c \n",
+ "30 OpenEvolveTrainer Trace scalar train after 0 train-a \n",
+ "31 OpenEvolveTrainer Trace scalar train before 0 train-a \n",
+ "32 PrioritySearch Trace scalar train after 0 train-a \n",
+ "33 PrioritySearch Trace scalar train before 0 train-a \n",
+ "34 TextGradTrainer Trace scalar train after 0 train-a \n",
+ "35 TextGradTrainer Trace scalar train before 0 train-a \n",
+ "36 OpenEvolveTrainer Trace scalar train after 1 train-b \n",
+ "37 OpenEvolveTrainer Trace scalar train before 1 train-b \n",
+ "38 PrioritySearch Trace scalar train after 1 train-b \n",
+ "39 PrioritySearch Trace scalar train before 1 train-b \n",
+ "40 TextGradTrainer Trace scalar train after 1 train-b \n",
+ "41 TextGradTrainer Trace scalar train before 1 train-b \n",
+ "42 OpenEvolveTrainer Trace scalar train after 2 train-c \n",
+ "43 OpenEvolveTrainer Trace scalar train before 2 train-c \n",
+ "44 PrioritySearch Trace scalar train after 2 train-c \n",
+ "45 PrioritySearch Trace scalar train before 2 train-c \n",
+ "46 TextGradTrainer Trace scalar train after 2 train-c \n",
+ "47 TextGradTrainer Trace scalar train before 2 train-c \n",
+ "\n",
+ " expected output score \n",
+ "0 3.0 0 -3.0 \n",
+ "1 3.0 0 -3.0 \n",
+ "2 3.0 0 -3.0 \n",
+ "3 3.0 0 -3.0 \n",
+ "4 3.0 0 -3.0 \n",
+ "5 3.0 0 -3.0 \n",
+ "6 3.0 0 -3.0 \n",
+ "7 3.0 0 -3.0 \n",
+ "8 3.0 0 -3.0 \n",
+ "9 3.0 0 -3.0 \n",
+ "10 3.0 0 -3.0 \n",
+ "11 3.0 0 -3.0 \n",
+ "12 3.0 0.0 -3.0 \n",
+ "13 3.0 0.0 -3.0 \n",
+ "14 3.0 3.0 -0.0 \n",
+ "15 3.0 0.0 -3.0 \n",
+ "16 3.0 3.0 -0.0 \n",
+ "17 3.0 0.0 -3.0 \n",
+ "18 3.0 0.0 -3.0 \n",
+ "19 3.0 0.0 -3.0 \n",
+ "20 3.0 3.0 -0.0 \n",
+ "21 3.0 0.0 -3.0 \n",
+ "22 3.0 3.0 -0.0 \n",
+ "23 3.0 0.0 -3.0 \n",
+ "24 3.0 0.0 -3.0 \n",
+ "25 3.0 0.0 -3.0 \n",
+ "26 3.0 3.0 -0.0 \n",
+ "27 3.0 0.0 -3.0 \n",
+ "28 3.0 3.0 -0.0 \n",
+ "29 3.0 0.0 -3.0 \n",
+ "30 3.0 0.0 -3.0 \n",
+ "31 3.0 0.0 -3.0 \n",
+ "32 3.0 3.0 -0.0 \n",
+ "33 3.0 0.0 -3.0 \n",
+ "34 3.0 3.0 -0.0 \n",
+ "35 3.0 0.0 -3.0 \n",
+ "36 3.0 0.0 -3.0 \n",
+ "37 3.0 0.0 -3.0 \n",
+ "38 3.0 3.0 -0.0 \n",
+ "39 3.0 0.0 -3.0 \n",
+ "40 3.0 3.0 -0.0 \n",
+ "41 3.0 0.0 -3.0 \n",
+ "42 3.0 0.0 -3.0 \n",
+ "43 3.0 0.0 -3.0 \n",
+ "44 3.0 3.0 -0.0 \n",
+ "45 3.0 0.0 -3.0 \n",
+ "46 3.0 3.0 -0.0 \n",
+ "47 3.0 0.0 -3.0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "summary_rows = []\n",
+ "example_rows = []\n",
+ "for item in smoke_results:\n",
+ " if \"result\" not in item:\n",
+ " summary_rows.append({\n",
+ " \"trainer_id\": item[\"trainer_id\"],\n",
+ " \"task\": item[\"task\"],\n",
+ " \"mode\": item[\"mode\"],\n",
+ " \"status\": item[\"status\"],\n",
+ " \"resolved_optimizer\": None,\n",
+ " \"before_value\": None,\n",
+ " \"after_value\": None,\n",
+ " \"train_examples\": len(SMOKE_TRAIN_DATASET[\"inputs\"]),\n",
+ " \"test_examples\": len(SMOKE_TEST_DATASET[\"inputs\"]),\n",
+ " \"before_train_score\": None,\n",
+ " \"after_train_score\": None,\n",
+ " \"train_delta\": None,\n",
+ " \"before_test_score\": None,\n",
+ " \"after_test_score\": None,\n",
+ " \"test_delta\": None,\n",
+ " \"error\": item[\"error\"],\n",
+ " })\n",
+ " continue\n",
+ " result_status = item[\"result\"].get(\"status\")\n",
+ " before_train = item[\"before\"][\"train\"][\"mean_score\"]\n",
+ " after_train = item[\"after\"][\"train\"][\"mean_score\"]\n",
+ " before_test = item[\"before\"][\"test\"][\"mean_score\"]\n",
+ " after_test = item[\"after\"][\"test\"][\"mean_score\"]\n",
+ " summary_rows.append({\n",
+ " \"trainer_id\": item[\"trainer_id\"],\n",
+ " \"task\": item[\"task\"],\n",
+ " \"mode\": item[\"mode\"],\n",
+ " \"status\": result_status,\n",
+ " \"resolved_optimizer\": item[\"result\"].get(\"resolved_optimizer\"),\n",
+ " \"before_value\": item[\"before\"][\"value\"],\n",
+ " \"after_value\": item[\"after\"][\"value\"],\n",
+ " \"train_examples\": len(SMOKE_TRAIN_DATASET[\"inputs\"]),\n",
+ " \"test_examples\": len(SMOKE_TEST_DATASET[\"inputs\"]),\n",
+ " \"before_train_score\": before_train,\n",
+ " \"after_train_score\": after_train,\n",
+ " \"train_delta\": after_train - before_train,\n",
+ " \"before_test_score\": before_test,\n",
+ " \"after_test_score\": after_test,\n",
+ " \"test_delta\": after_test - before_test,\n",
+ " \"error\": item[\"result\"].get(\"error\"),\n",
+ " })\n",
+ " for split_name in (\"train\", \"test\"):\n",
+ " for phase in (\"before\", \"after\"):\n",
+ " for index, row in enumerate(item[phase][split_name][\"rows\"]):\n",
+ " example_rows.append({\n",
+ " \"trainer_id\": item[\"trainer_id\"],\n",
+ " \"task\": item[\"task\"],\n",
+ " \"split\": split_name,\n",
+ " \"phase\": phase,\n",
+ " \"example\": index,\n",
+ " \"input\": row[\"input\"],\n",
+ " \"expected\": row[\"expected\"],\n",
+ " \"output\": row[\"output\"],\n",
+ " \"score\": row[\"score\"],\n",
+ " })\n",
+ "\n",
+ "trainer_comparison = pd.DataFrame(summary_rows)\n",
+ "example_comparison = pd.DataFrame(example_rows)\n",
+ "\n",
+ "display(trainer_comparison)\n",
+ "if example_rows:\n",
+ " display(example_comparison.sort_values([\"task\", \"split\", \"example\", \"trainer_id\", \"phase\"]).reset_index(drop=True))\n",
+ "else:\n",
+ " print(\"No per-example outputs were produced because all real trainer runs errored.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "72ec5773",
+ "metadata": {},
+ "source": [
+ "## 9. Practical reading guide\n",
+ "\n",
+ "When you inspect the results, read them in this order:\n",
+ "\n",
+ "1. **Focused tests** \n",
+ " If these fail, the branch is not ready to trust.\n",
+ "\n",
+ "2. **Discovery table** \n",
+ " If `TextGradTrainer`, `OpenEvolveTrainer`, or `DSPyTrainer` is missing, the branch code is not present or the optional packages are not installed.\n",
+ "\n",
+ "3. **Real train/test smoke tables** \n",
+ " This confirms each trainer uses the real installed package path on three train examples and three held-out examples.\n",
+ "\n",
+ "4. **Error rows** \n",
+ " An error row means the real trainer path failed and should be inspected before trusting comparison scores."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "83656ee2",
+ "metadata": {},
+ "source": [
+ "## 10. What counts as success\n",
+ "\n",
+ "### Strong success\n",
+ "- focused tests pass\n",
+ "- discovery shows the comparison trainers\n",
+ "- real `opto.optimizers.textgrad.TextGrad`, `openevolve.run_evolution`, and `dspy.LM` import successfully\n",
+ "- real smoke rows for Trace, TextGrad, OpenEvolve, and DSPy complete without errors\n",
+ "\n",
+ "### Partial success\n",
+ "- focused tests pass\n",
+ "- structural checks pass\n",
+ "- one trainer reports an error row while the others complete, so the failure is isolated and the remaining results stay comparable\n",
+ "\n",
+ "### Failure\n",
+ "- trainers are not discovered\n",
+ "- focused tests fail\n",
+ "- DSPy is not backed by a real `dspy.LM`\n",
+ "- OpenEvolve path requires `exec` or unsafe parsing"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tests/test_dspy_trainer.py b/tests/test_dspy_trainer.py
new file mode 100644
index 0000000..63cd74a
--- /dev/null
+++ b/tests/test_dspy_trainer.py
@@ -0,0 +1,28 @@
+import itertools
+from typing import Any
+
+import pytest
+
+
+def test_dspy_trainer_restores_empty_global_lm(monkeypatch: pytest.MonkeyPatch) -> None:
+ dspy = pytest.importorskip("dspy")
+ trainer_module = pytest.importorskip("trace_bench.trainers.dspy_trainer")
+ from dspy.utils import DummyLM
+
+ previous_lm = getattr(dspy.settings, "lm", None)
+ dspy.configure(lm=None)
+ trainer = trainer_module.DSPyTrainer(object())
+
+ def _train_inner(**_kwargs: Any) -> dict[str, str]:
+ return {"status": "ok"}
+
+ monkeypatch.setattr(trainer, "_train_inner", _train_inner)
+ try:
+ trainer.train(
+ guide=object(),
+ train_dataset={"inputs": [], "infos": []},
+ dspy_lm=DummyLM(itertools.cycle([{"answer": "ok"}])),
+ )
+ assert getattr(dspy.settings, "lm", None) is None
+ finally:
+ dspy.configure(lm=previous_lm)
diff --git a/tests/test_llm_utils.py b/tests/test_llm_utils.py
new file mode 100644
index 0000000..1f2aaa0
--- /dev/null
+++ b/tests/test_llm_utils.py
@@ -0,0 +1,19 @@
+import pytest
+
+from trace_bench.llm import openai_compatible_model_name
+
+
+def test_openai_compatible_model_name_strips_openrouter_prefix() -> None:
+ assert (
+ openai_compatible_model_name("openrouter/openai/gpt-4o-mini")
+ == "openai/gpt-4o-mini"
+ )
+
+
+def test_openai_compatible_model_name_keeps_other_model_names() -> None:
+ assert openai_compatible_model_name("gpt-4o-mini") == "gpt-4o-mini"
+
+
+def test_openai_compatible_model_name_requires_string() -> None:
+ with pytest.raises(TypeError, match="model must be a string"):
+ openai_compatible_model_name(None) # type: ignore[arg-type]
diff --git a/tests/test_openevolve_trainer.py b/tests/test_openevolve_trainer.py
index 1c4bfbd..22d26b9 100644
--- a/tests/test_openevolve_trainer.py
+++ b/tests/test_openevolve_trainer.py
@@ -1,5 +1,8 @@
+import asyncio
import importlib
+import os
import sys
+import tempfile
import types
import pytest
@@ -31,15 +34,62 @@ def __call__(self, task_input: str, response: str, task_info: str):
return (1.0 if response == task_info else 0.0), f"expected {task_info}"
-def _import_openevolve_trainer(monkeypatch, best_code: str):
+def _install_fake_openevolve_config(monkeypatch: pytest.MonkeyPatch) -> None:
+ fake_config_module = types.ModuleType("openevolve.config")
+
+ class _FakeDatabaseConfig:
+ def __init__(self) -> None:
+ self.population_size = 1000
+ self.num_islands = 5
+
+ class _FakeLLMConfig:
+ def __init__(self) -> None:
+ self.api_base = "https://api.openai.com/v1"
+ self.api_key = None
+ self.max_tokens = 4096
+ self.temperature = 0.7
+ self.timeout = 60
+ self.retries = 3
+ self.retry_delay = 5
+ self.models = []
+ self.evaluator_models = []
+
+ class _FakeConfig:
+ def __init__(self, max_iterations: int, random_seed: int | None) -> None:
+ self.max_iterations = max_iterations
+ self.random_seed = random_seed
+ self.database = _FakeDatabaseConfig()
+ self.llm = _FakeLLMConfig()
+
+ class _FakeLLMModelConfig:
+ def __init__(self, **kwargs: object) -> None:
+ self.kwargs = kwargs
+
+ fake_config_module.Config = _FakeConfig
+ fake_config_module.LLMModelConfig = _FakeLLMModelConfig
+ monkeypatch.setitem(sys.modules, "openevolve.config", fake_config_module)
+
+
+def _import_openevolve_trainer(monkeypatch: pytest.MonkeyPatch, best_code: str, capture: dict[str, object] | None = None) -> types.ModuleType:
fake_module = types.ModuleType("openevolve")
- def _run_evolution(*, initial_program, evaluator, iterations, **_kwargs):
- del initial_program, evaluator, iterations
+ def _run_evolution(*, initial_program, evaluator, iterations, config=None, **_kwargs):
+ if capture is not None:
+ with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as handle:
+ handle.write(best_code)
+ candidate_path = handle.name
+ try:
+ capture["evaluation"] = evaluator(candidate_path)
+ finally:
+ os.unlink(candidate_path)
+ capture["config"] = config
+ capture["iterations"] = iterations
+ del initial_program
return types.SimpleNamespace(best_code=best_code)
fake_module.run_evolution = _run_evolution
monkeypatch.setitem(sys.modules, "openevolve", fake_module)
+ _install_fake_openevolve_config(monkeypatch)
sys.modules.pop("trace_bench.trainers.openevolve_trainer", None)
return importlib.import_module("trace_bench.trainers.openevolve_trainer")
@@ -62,6 +112,26 @@ def test_openevolve_trainer_updates_parameter(monkeypatch) -> None:
assert trainer.param.greeting.data == "Hello"
+def test_openevolve_trainer_runs_inside_active_event_loop(monkeypatch) -> None:
+ trainer_module = _import_openevolve_trainer(
+ monkeypatch,
+ best_code='candidate = {"greeting": "Hello"}\n',
+ )
+
+ async def _run_training() -> dict[str, object]:
+ trainer = trainer_module.OpenEvolveTrainer(_DummyAgent("Hi"))
+ return trainer.train(
+ guide=_DummyGuide(),
+ train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]},
+ mode="real",
+ iterations=1,
+ ensure_improvement=False,
+ )
+
+ result = asyncio.run(_run_training())
+ assert result["status"] == "ok"
+
+
def test_openevolve_trainer_rejects_worse_candidate(monkeypatch) -> None:
trainer_module = _import_openevolve_trainer(
monkeypatch,
@@ -112,3 +182,28 @@ def parameters(self):
mode="real",
iterations=1,
)
+
+
+def test_openevolve_trainer_returns_combined_score_and_configures_population(monkeypatch) -> None:
+ capture: dict = {}
+ trainer_module = _import_openevolve_trainer(
+ monkeypatch,
+ best_code='candidate = {"greeting": "Hello"}\n',
+ capture=capture,
+ )
+ trainer = trainer_module.OpenEvolveTrainer(_DummyAgent("Hi"))
+ trainer.train(
+ guide=_DummyGuide(),
+ train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]},
+ mode="real",
+ iterations=2,
+ population_size=12,
+ num_islands=3,
+ model="openrouter/openai/gpt-4o-mini",
+ api_key="test-key",
+ ensure_improvement=False,
+ )
+ assert capture["evaluation"]["combined_score"] == capture["evaluation"]["score"]
+ assert capture["config"].database.population_size == 12
+ assert capture["config"].database.num_islands == 3
+ assert capture["config"].llm.models[0].kwargs["name"] == "openai/gpt-4o-mini"
diff --git a/trace_bench/trainers/dspy_trainer.py b/trace_bench/trainers/dspy_trainer.py
index 665cb02..9c1f702 100644
--- a/trace_bench/trainers/dspy_trainer.py
+++ b/trace_bench/trainers/dspy_trainer.py
@@ -505,7 +505,7 @@ def train(
verbose=verbose,
)
finally:
- if resolved_lm is not None and prev_lm is not None:
+ if resolved_lm is not None:
_dspy.configure(lm=prev_lm)
def _train_inner(
diff --git a/trace_bench/trainers/openevolve_trainer.py b/trace_bench/trainers/openevolve_trainer.py
index 43b211a..9171e40 100644
--- a/trace_bench/trainers/openevolve_trainer.py
+++ b/trace_bench/trainers/openevolve_trainer.py
@@ -1,7 +1,10 @@
from __future__ import annotations
import ast
+import asyncio
import inspect
+import os
+from functools import partial
from pathlib import Path
from pprint import pformat
from threading import RLock
@@ -12,6 +15,7 @@
except Exception as exc:
raise ImportError("OpenEvolveTrainer requires the optional 'openevolve' package.") from exc
+from trace_bench.llm import openai_compatible_model_name
from trace_bench.trainers._external_utils import apply_parameter_updates, collect_trainable_parameters, coerce_like, resolve_external_trainer_base, restore_parameter_values, score_model_on_dataset, snapshot_parameter_values, summarize_feedback
_TrainerBase = resolve_external_trainer_base()
@@ -81,6 +85,68 @@ def _filter_supported_kwargs(function: Any, kwargs: Dict[str, Any]) -> Dict[str,
return dict(kwargs)
return {key: value for key, value in kwargs.items() if key in signature.parameters}
+def _run_evolution_compatible(kwargs: Dict[str, Any]) -> Any:
+ """Run OpenEvolve even when the caller already owns an asyncio event loop."""
+ try:
+ asyncio.get_running_loop()
+ except RuntimeError:
+ return _run_evolution(**kwargs)
+
+ try:
+ import nest_asyncio
+ except ImportError as exc:
+ raise RuntimeError(
+ "OpenEvolveTrainer requires nest_asyncio when called from an active asyncio event loop."
+ ) from exc
+ nest_asyncio.apply()
+ return _run_evolution(**kwargs)
+
+def _build_openevolve_config(*, model: Optional[str], api_base: Optional[str], api_key: Optional[str], api_key_env: str, max_tokens: int, temperature: Optional[float], iterations: int, seed: Optional[int], population_size: Optional[int], num_islands: Optional[int]) -> Any:
+ """Build an OpenEvolve config for OpenAI-compatible providers when requested."""
+ if api_key_env and not isinstance(api_key_env, str):
+ raise TypeError("api_key_env must be a string.")
+ if population_size is not None and population_size < 1:
+ raise ValueError("population_size must be at least 1.")
+ if num_islands is not None and num_islands < 1:
+ raise ValueError("num_islands must be at least 1.")
+ resolved_api_key = api_key or (os.environ.get(api_key_env) if api_key_env else None)
+ resolved_api_base = api_base or os.environ.get("OPENAI_BASE_URL") or os.environ.get("OPENAI_API_BASE")
+ resolved_model = model or os.environ.get("TRACE_LITELLM_MODEL")
+ if not any((resolved_api_key, resolved_api_base, resolved_model, population_size, num_islands)):
+ return None
+ if not resolved_model:
+ resolved_model = "gpt-4o-mini"
+ if max_tokens < 1:
+ raise ValueError("max_tokens must be at least 1.")
+
+ from openevolve.config import Config, LLMModelConfig
+
+ config = Config(max_iterations=iterations, random_seed=seed)
+ if population_size is not None:
+ config.database.population_size = population_size
+ if num_islands is not None:
+ config.database.num_islands = num_islands
+ if resolved_api_base:
+ config.llm.api_base = resolved_api_base
+ if resolved_api_key:
+ config.llm.api_key = resolved_api_key
+ config.llm.max_tokens = max_tokens
+ config.llm.temperature = temperature
+ model_config = LLMModelConfig(
+ name=openai_compatible_model_name(resolved_model),
+ api_base=config.llm.api_base,
+ api_key=config.llm.api_key,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ timeout=config.llm.timeout,
+ retries=config.llm.retries,
+ retry_delay=config.llm.retry_delay,
+ random_seed=seed,
+ )
+ config.llm.models = [model_config]
+ config.llm.evaluator_models = [model_config]
+ return config
+
class OpenEvolveTrainer(_TrainerBase):
"""Trace-Bench wrapper around OpenEvolve using safe literal parameter serialization."""
@@ -91,7 +157,7 @@ def __init__(self, agent: Any, optimizer: Any = None, logger: Any = None, **_kwa
self.param = agent
self.logger = logger
- def train(self, guide: Any, train_dataset: Dict[str, Any], *, mode: str = "real", validate_dataset: Optional[Dict[str, Any]] = None, iterations: int = 10, population_size: Optional[int] = None, num_islands: Optional[int] = None, seed: Optional[int] = None, ensure_improvement: bool = True, improvement_threshold: float = 0.0, verbose: Union[bool, str] = False, **_kwargs: Any) -> Dict[str, Any]:
+ def train(self, guide: Any, train_dataset: Dict[str, Any], *, mode: str = "real", validate_dataset: Optional[Dict[str, Any]] = None, iterations: int = 10, population_size: Optional[int] = None, num_islands: Optional[int] = None, seed: Optional[int] = None, ensure_improvement: bool = True, improvement_threshold: float = 0.0, verbose: Union[bool, str] = False, model: Optional[str] = None, api_base: Optional[str] = None, api_key: Optional[str] = None, api_key_env: str = "OPENAI_API_KEY", max_tokens: int = 4096, temperature: Optional[float] = 0.7, output_dir: Optional[str] = None, cleanup: bool = True, **_kwargs: Any) -> Dict[str, Any]:
"""Optimize Trace parameters with OpenEvolve via a literal candidate mapping."""
if mode not in {"real", "stub"}:
raise ValueError("mode must be either 'real' or 'stub'.")
@@ -110,7 +176,7 @@ def evaluator(candidate_path: str) -> Dict[str, Any]:
try:
update_dict = _parse_candidate_program(program_text, parameters)
except (TypeError, ValueError) as exc:
- return {"score": float("-inf"), "feedback": str(exc)}
+ return {"score": float("-inf"), "combined_score": float("-inf"), "feedback": str(exc)}
with _EVALUATION_LOCK:
snapshot = snapshot_parameter_values(parameters)
try:
@@ -118,12 +184,15 @@ def evaluator(candidate_path: str) -> Dict[str, Any]:
score, feedbacks = score_model_on_dataset(agent=self.param, guide=guide, dataset=evaluation_dataset, suppress_exceptions=True)
finally:
restore_parameter_values(snapshot)
- return {"score": score, "feedback": summarize_feedback(feedbacks), "artifacts": {"candidate": {parameter.py_name: value for parameter, value in update_dict.items()}}}
+ return {"score": score, "combined_score": score, "feedback": summarize_feedback(feedbacks), "artifacts": {"candidate": {parameter.py_name: value for parameter, value in update_dict.items()}}}
initial_program = _serialize_candidate_program(parameters)
- run_kwargs = {"iterations": iterations, "population_size": population_size, "num_islands": num_islands, "seed": seed, "verbose": verbose if isinstance(verbose, bool) else False}
+ config = _build_openevolve_config(model=model, api_base=api_base, api_key=api_key, api_key_env=api_key_env, max_tokens=max_tokens, temperature=temperature, iterations=iterations, seed=seed, population_size=population_size, num_islands=num_islands)
+ run_kwargs = {"iterations": iterations, "population_size": population_size, "num_islands": num_islands, "seed": seed, "verbose": verbose if isinstance(verbose, bool) else False, "config": config, "output_dir": output_dir, "cleanup": cleanup}
filtered_kwargs = _filter_supported_kwargs(_run_evolution, {key: value for key, value in run_kwargs.items() if value is not None})
- result = _run_evolution(initial_program=initial_program, evaluator=evaluator, **filtered_kwargs)
+ result = _run_evolution_compatible(
+ {"initial_program": initial_program, "evaluator": partial(evaluator), **filtered_kwargs}
+ )
best_code = _extract_best_code(result)
best_update = _parse_candidate_program(best_code, parameters)
From 59cda2b01ff97d0e23bbbec153a5a0d63cadc454 Mon Sep 17 00:00:00 2001
From: doxav <>
Date: Wed, 3 Jun 2026 00:30:30 +0200
Subject: [PATCH 3/8] refined notebook TextGrad OpenEvolve DSPy
---
...tgrad_openevolve_evaluation_notebook.ipynb | 2201 +++++------------
tests/test_textgrad_trainer.py | 22 +-
trace_bench/trainers/textgrad_trainer.py | 4 +-
3 files changed, 644 insertions(+), 1583 deletions(-)
diff --git a/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb b/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb
index 87d0955..fb09a9c 100644
--- a/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb
+++ b/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb
@@ -9,13 +9,12 @@
"\n",
"This notebook validates and compares four real trainer paths:\n",
"\n",
- "- `PrioritySearch` as the Trace baseline\n",
+ "- `PrioritySearch` as the standard Trace baseline\n",
"- `TextGradTrainer`\n",
"- `OpenEvolveTrainer`\n",
"- `DSPyTrainer`\n",
"\n",
- "It checks out the `textgrad_openevolve` branch, installs the real optional packages when needed, runs focused structural checks, and then runs a tiny real train/test comparison with OpenRouter or OpenAI.\n",
- "The notebook assumes the `textgrad_openevolve` branch contains the trainer integration under test."
+ "It checks out `textgrad_openevolve`, installs real optional packages when needed, runs focused integration checks, and runs a small real optimization demo with OpenRouter or OpenAI. The DSPy row uses a DSPy-native task; the Trace/TextGrad/OpenEvolve rows use a Trace scalar task."
]
},
{
@@ -27,9 +26,9 @@
"\n",
"- required trainer packages import from real installations\n",
"- Trace-Bench discovers the trainer classes\n",
- "- focused tests and compile checks pass\n",
- "- every comparison row uses three train examples and three held-out examples\n",
- "- result tables show trainer status, optimizer identity, before/after scores, and per-example outputs"
+ "- focused trainer tests and compile checks pass\n",
+ "- every comparison row learns from three examples and reports three held-out examples\n",
+ "- result tables show before/after scores, per-example outputs, and red highlighting for rows with no held-out improvement"
]
},
{
@@ -39,20 +38,11 @@
"source": [
"## High-level interpretation guide\n",
"\n",
- "Use this notebook in three layers:\n",
+ "Use the notebook in this order:\n",
"\n",
- "1. **Code-level correctness**\n",
- " - Do the new trainers exist?\n",
- " - Are they discovered?\n",
- " - Do focused tests pass?\n",
- "\n",
- "2. **Behavior-level smoke checks**\n",
- " - Do the trainer paths run against real installed packages?\n",
- " - Do they produce comparable before/after rows?\n",
- "\n",
- "3. **Practical comparison**\n",
- " - Which trainers improve on the tiny task?\n",
- " - Which trainers complete but do not improve in this small budget?"
+ "1. Confirm the branch, package imports, trainer discovery, and focused tests.\n",
+ "2. Compare before/after train and held-out scores.\n",
+ "3. Treat any red row as a real trainer run that either failed or completed without improving the held-out score."
]
},
{
@@ -61,10 +51,10 @@
"id": "56d885a1",
"metadata": {
"execution": {
- "iopub.execute_input": "2026-05-28T17:29:44.264090Z",
- "iopub.status.busy": "2026-05-28T17:29:44.263974Z",
- "iopub.status.idle": "2026-05-28T17:29:44.268749Z",
- "shell.execute_reply": "2026-05-28T17:29:44.268429Z"
+ "iopub.execute_input": "2026-06-02T16:40:11.307336Z",
+ "iopub.status.busy": "2026-06-02T16:40:11.307259Z",
+ "iopub.status.idle": "2026-06-02T16:40:11.311966Z",
+ "shell.execute_reply": "2026-06-02T16:40:11.311533Z"
}
},
"outputs": [
@@ -156,10 +146,10 @@
"id": "b6ca8593",
"metadata": {
"execution": {
- "iopub.execute_input": "2026-05-28T17:29:44.270467Z",
- "iopub.status.busy": "2026-05-28T17:29:44.270388Z",
- "iopub.status.idle": "2026-05-28T17:29:46.996012Z",
- "shell.execute_reply": "2026-05-28T17:29:46.995690Z"
+ "iopub.execute_input": "2026-06-02T16:40:11.313317Z",
+ "iopub.status.busy": "2026-06-02T16:40:11.313253Z",
+ "iopub.status.idle": "2026-06-02T16:40:13.710562Z",
+ "shell.execute_reply": "2026-06-02T16:40:13.710030Z"
}
},
"outputs": [
@@ -167,78 +157,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Trace-Bench already exists; checking out textgrad_openevolve.\n",
- "$ git fetch https://github.com/doxav/Trace-Bench.git textgrad_openevolve\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "From https://github.com/doxav/Trace-Bench\n",
- " * branch textgrad_openevolve -> FETCH_HEAD\n",
- "Already on 'textgrad_openevolve'\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "$ git checkout textgrad_openevolve\n",
- "M\ttests/test_openevolve_trainer.py\n",
- "M\ttrace_bench/trainers/openevolve_trainer.py\n",
- "Your branch is up to date with 'origin/textgrad_openevolve'.\n",
- "$ git pull --ff-only https://github.com/doxav/Trace-Bench.git textgrad_openevolve\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "From https://github.com/doxav/Trace-Bench\n",
- " * branch textgrad_openevolve -> FETCH_HEAD\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Already up to date.\n",
- "NewTrace already exists; checking out experimental.\n",
- "$ git fetch https://github.com/doxav/NewTrace.git experimental\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "From https://github.com/doxav/NewTrace\n",
- " * branch experimental -> FETCH_HEAD\n",
- "Already on 'experimental'\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "$ git checkout experimental\n",
- "Your branch is up to date with 'origin/experimental'.\n",
- "$ git pull --ff-only https://github.com/doxav/NewTrace.git experimental\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Already up to date.\n"
+ "Trace-Bench current checkout is already on textgrad_openevolve; preserving local edits.\n",
+ "$ git clone --branch experimental --single-branch https://github.com/doxav/NewTrace.git /home/xav/code/Trace-Bench/NewTrace\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "From https://github.com/doxav/NewTrace\n",
- " * branch experimental -> FETCH_HEAD\n"
+ "Cloning into '/home/xav/code/Trace-Bench/NewTrace'...\n"
]
}
],
@@ -251,6 +178,12 @@
" TRACE_BENCH_REMOTE_URL,\n",
" str(TRACE_BENCH_REPO),\n",
" ])\n",
+ "elif TRACE_BENCH_REPO.resolve() == CURRENT_REPO.resolve():\n",
+ " branch = subprocess.check_output([\"git\", \"branch\", \"--show-current\"], cwd=TRACE_BENCH_REPO, text=True).strip()\n",
+ " if branch != TRACE_BENCH_BRANCH:\n",
+ " checkout_branch(TRACE_BENCH_REPO, TRACE_BENCH_REMOTE_URL, TRACE_BENCH_BRANCH)\n",
+ " else:\n",
+ " print(f\"Trace-Bench current checkout is already on {TRACE_BENCH_BRANCH}; preserving local edits.\")\n",
"else:\n",
" print(f\"Trace-Bench already exists; checking out {TRACE_BENCH_BRANCH}.\")\n",
" checkout_branch(TRACE_BENCH_REPO, TRACE_BENCH_REMOTE_URL, TRACE_BENCH_BRANCH)\n",
@@ -275,12 +208,7 @@
"source": [
"## 2. Install Python dependencies\n",
"\n",
- "This installs:\n",
- "- `NewTrace` editable\n",
- "- `Trace-Bench` editable\n",
- "- light dependencies needed for the focused validation notebook\n",
- "\n",
- "If `openevolve.run_evolution` is not importable, this clones OpenEvolve from GitHub and installs it editable."
+ "This installs NewTrace and Trace-Bench editable, plus the real optional trainer packages used by the demo. If `openevolve.run_evolution` is not importable, OpenEvolve is cloned and installed editable."
]
},
{
@@ -289,10 +217,10 @@
"id": "fbae758b",
"metadata": {
"execution": {
- "iopub.execute_input": "2026-05-28T17:29:46.997122Z",
- "iopub.status.busy": "2026-05-28T17:29:46.997049Z",
- "iopub.status.idle": "2026-05-28T17:29:51.676176Z",
- "shell.execute_reply": "2026-05-28T17:29:51.675385Z"
+ "iopub.execute_input": "2026-06-02T16:40:13.711999Z",
+ "iopub.status.busy": "2026-06-02T16:40:13.711923Z",
+ "iopub.status.idle": "2026-06-02T16:40:20.589979Z",
+ "shell.execute_reply": "2026-06-02T16:40:20.589514Z"
}
},
"outputs": [
@@ -307,7 +235,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "$ /home/xav/miniconda3/bin/python -m pip install -q graphviz pyyaml pytest litellm aiohttp nest_asyncio dspy-ai tensorboard tensorboardX scikit-learn datasets openai pandas\n"
+ "$ /home/xav/miniconda3/bin/python -m pip install -q graphviz pyyaml pytest litellm aiohttp nest_asyncio dspy-ai optuna tensorboard tensorboardX scikit-learn datasets openai pandas\n"
]
},
{
@@ -328,7 +256,7 @@
"source": [
"run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-U\", \"pip\", \"setuptools\", \"wheel\"])\n",
"run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\",\n",
- " \"graphviz\", \"pyyaml\", \"pytest\", \"litellm\", \"aiohttp\", \"nest_asyncio\", \"dspy-ai\",\n",
+ " \"graphviz\", \"pyyaml\", \"pytest\", \"litellm\", \"aiohttp\", \"nest_asyncio\", \"dspy-ai\", \"optuna\",\n",
" \"tensorboard\", \"tensorboardX\", \"scikit-learn\", \"datasets\", \"openai\", \"pandas\"])\n",
"\n",
"run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-e\", str(NEWTRACE_REPO)])\n",
@@ -357,7 +285,7 @@
" run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-e\", str(OPENEVOLVE_REPO)])\n",
"\n",
"if not has_real_openevolve():\n",
- " raise ImportError(\"OpenEvolve is required for this demo and could not be installed.\")\n"
+ " raise ImportError(\"OpenEvolve is required for this demo and could not be installed.\")"
]
},
{
@@ -367,11 +295,7 @@
"source": [
"## 3. Provider setup for real online experiments\n",
"\n",
- "The real smoke comparison requires this provider setup. Structural tests can still run before a provider is configured.\n",
- "\n",
- "Supported:\n",
- "- `openrouter`\n",
- "- `openai`"
+ "The comparison requires a real provider. In Colab the cell reads `OPENROUTER_API_KEY` or `OPENAI_API_KEY` from Colab Secrets when present; locally it reads the same environment variables."
]
},
{
@@ -380,10 +304,10 @@
"id": "0b984ab0",
"metadata": {
"execution": {
- "iopub.execute_input": "2026-05-28T17:29:51.677483Z",
- "iopub.status.busy": "2026-05-28T17:29:51.677372Z",
- "iopub.status.idle": "2026-05-28T17:29:51.680910Z",
- "shell.execute_reply": "2026-05-28T17:29:51.680574Z"
+ "iopub.execute_input": "2026-06-02T16:40:20.591586Z",
+ "iopub.status.busy": "2026-06-02T16:40:20.591465Z",
+ "iopub.status.idle": "2026-06-02T16:40:20.595509Z",
+ "shell.execute_reply": "2026-06-02T16:40:20.595070Z"
}
},
"outputs": [
@@ -470,10 +394,10 @@
"id": "3b4768bb",
"metadata": {
"execution": {
- "iopub.execute_input": "2026-05-28T17:29:51.682018Z",
- "iopub.status.busy": "2026-05-28T17:29:51.681957Z",
- "iopub.status.idle": "2026-05-28T17:29:56.379387Z",
- "shell.execute_reply": "2026-05-28T17:29:56.378979Z"
+ "iopub.execute_input": "2026-06-02T16:40:20.596914Z",
+ "iopub.status.busy": "2026-06-02T16:40:20.596844Z",
+ "iopub.status.idle": "2026-06-02T16:40:25.850826Z",
+ "shell.execute_reply": "2026-06-02T16:40:25.850296Z"
}
},
"outputs": [
@@ -555,10 +479,10 @@
"id": "af508c08",
"metadata": {
"execution": {
- "iopub.execute_input": "2026-05-28T17:29:56.380992Z",
- "iopub.status.busy": "2026-05-28T17:29:56.380906Z",
- "iopub.status.idle": "2026-05-28T17:29:57.560432Z",
- "shell.execute_reply": "2026-05-28T17:29:57.559955Z"
+ "iopub.execute_input": "2026-06-02T16:40:25.852675Z",
+ "iopub.status.busy": "2026-06-02T16:40:25.852596Z",
+ "iopub.status.idle": "2026-06-02T16:40:29.170929Z",
+ "shell.execute_reply": "2026-06-02T16:40:29.170457Z"
}
},
"outputs": [
@@ -566,22 +490,42 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "$ /home/xav/miniconda3/bin/python -m pytest tests/test_resolve_external_trainers.py tests/test_external_utils.py -q\n"
+ "$ /home/xav/miniconda3/bin/python -m pytest tests/test_resolve_external_trainers.py tests/test_external_utils.py tests/test_llm_utils.py tests/test_textgrad_trainer.py tests/test_openevolve_trainer.py tests/test_dspy_trainer.py -q\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "................"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ ". [100%]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "17 passed in 2.54s\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "... [100%]\n",
- "3 passed in 0.93s\n",
- "$ /home/xav/miniconda3/bin/python -m py_compile trace_bench/resolve.py trace_bench/cli.py trace_bench/runner.py trace_bench/trainers/_external_utils.py trace_bench/trainers/textgrad_trainer.py trace_bench/trainers/openevolve_trainer.py trace_bench/trainers/dspy_trainer.py\n"
+ "$ /home/xav/miniconda3/bin/python -m py_compile trace_bench/resolve.py trace_bench/cli.py trace_bench/runner.py trace_bench/llm.py trace_bench/trainers/_external_utils.py trace_bench/trainers/textgrad_trainer.py trace_bench/trainers/openevolve_trainer.py trace_bench/trainers/dspy_trainer.py\n"
]
},
{
"data": {
"text/plain": [
- "CompletedProcess(args=['/home/xav/miniconda3/bin/python', '-m', 'py_compile', 'trace_bench/resolve.py', 'trace_bench/cli.py', 'trace_bench/runner.py', 'trace_bench/trainers/_external_utils.py', 'trace_bench/trainers/textgrad_trainer.py', 'trace_bench/trainers/openevolve_trainer.py', 'trace_bench/trainers/dspy_trainer.py'], returncode=0)"
+ "CompletedProcess(args=['/home/xav/miniconda3/bin/python', '-m', 'py_compile', 'trace_bench/resolve.py', 'trace_bench/cli.py', 'trace_bench/runner.py', 'trace_bench/llm.py', 'trace_bench/trainers/_external_utils.py', 'trace_bench/trainers/textgrad_trainer.py', 'trace_bench/trainers/openevolve_trainer.py', 'trace_bench/trainers/dspy_trainer.py'], returncode=0)"
]
},
"execution_count": 6,
@@ -593,6 +537,10 @@
"TARGETED_TESTS = [\n",
" \"tests/test_resolve_external_trainers.py\",\n",
" \"tests/test_external_utils.py\",\n",
+ " \"tests/test_llm_utils.py\",\n",
+ " \"tests/test_textgrad_trainer.py\",\n",
+ " \"tests/test_openevolve_trainer.py\",\n",
+ " \"tests/test_dspy_trainer.py\",\n",
"]\n",
"\n",
"run([sys.executable, \"-m\", \"pytest\", *TARGETED_TESTS, \"-q\"], cwd=TRACE_BENCH_REPO)\n",
@@ -600,6 +548,7 @@
" \"trace_bench/resolve.py\",\n",
" \"trace_bench/cli.py\",\n",
" \"trace_bench/runner.py\",\n",
+ " \"trace_bench/llm.py\",\n",
" \"trace_bench/trainers/_external_utils.py\",\n",
" \"trace_bench/trainers/textgrad_trainer.py\",\n",
" \"trace_bench/trainers/openevolve_trainer.py\",\n",
@@ -622,10 +571,10 @@
"id": "c182738c",
"metadata": {
"execution": {
- "iopub.execute_input": "2026-05-28T17:29:57.561442Z",
- "iopub.status.busy": "2026-05-28T17:29:57.561358Z",
- "iopub.status.idle": "2026-05-28T17:29:57.605388Z",
- "shell.execute_reply": "2026-05-28T17:29:57.604880Z"
+ "iopub.execute_input": "2026-06-02T16:40:29.172741Z",
+ "iopub.status.busy": "2026-06-02T16:40:29.172667Z",
+ "iopub.status.idle": "2026-06-02T16:40:29.246810Z",
+ "shell.execute_reply": "2026-06-02T16:40:29.246111Z"
}
},
"outputs": [
@@ -755,9 +704,9 @@
"id": "118fce8b",
"metadata": {},
"source": [
- "## 7. Shared helpers for train/test smoke evaluation\n",
+ "## 7. Shared helpers for real train/test optimization\n",
"\n",
- "The Trace, TextGrad, and OpenEvolve rows reuse `trace_examples:opentrace_train_single_node`. The DSPy row uses a tiny real `dspy.Module` with the same scalar target. Every row learns from three examples and reports held-out performance on three more examples."
+ "The Trace, TextGrad, and OpenEvolve rows use the same Trace scalar parameter task. The DSPy row uses a small routing-code task where MIPROv2 can optimize instructions from labeled examples within notebook runtime."
]
},
{
@@ -766,18 +715,21 @@
"id": "8eddf6c9",
"metadata": {
"execution": {
- "iopub.execute_input": "2026-05-28T17:29:57.606741Z",
- "iopub.status.busy": "2026-05-28T17:29:57.606666Z",
- "iopub.status.idle": "2026-05-28T17:29:57.617692Z",
- "shell.execute_reply": "2026-05-28T17:29:57.617113Z"
+ "iopub.execute_input": "2026-06-02T16:40:29.248580Z",
+ "iopub.status.busy": "2026-06-02T16:40:29.248504Z",
+ "iopub.status.idle": "2026-06-02T16:40:29.259545Z",
+ "shell.execute_reply": "2026-06-02T16:40:29.259148Z"
}
},
"outputs": [],
"source": [
+ "import contextlib\n",
+ "import io\n",
"import re\n",
- "from typing import Any\n",
+ "from typing import Any, Callable\n",
"\n",
"import dspy\n",
+ "from IPython.display import HTML, display\n",
"\n",
"from trace_bench.config import TrainerConfig\n",
"from trace_bench.registry import load_task_bundle\n",
@@ -786,108 +738,100 @@
"\n",
"TRACE_TASK_ID = \"trace_examples:opentrace_train_single_node\"\n",
"TASKS_ROOT = str(TRACE_BENCH_REPO / \"LLM4AD\" / \"benchmark_tasks\")\n",
- "SMOKE_INITIAL_VALUE = 0.0\n",
- "SMOKE_TARGET_VALUE = 3.0\n",
- "SMOKE_TRAIN_DATASET = {\n",
- " \"inputs\": [\"train-a\", \"train-b\", \"train-c\"],\n",
- " \"infos\": [SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE],\n",
+ "TRACE_INITIAL_VALUE = 0.0\n",
+ "TRACE_TARGET_VALUE = 3.0\n",
+ "TRACE_TRAIN_DATASET = {\"inputs\": [\"train-a\", \"train-b\", \"train-c\"], \"infos\": [TRACE_TARGET_VALUE] * 3}\n",
+ "TRACE_TEST_DATASET = {\"inputs\": [\"test-a\", \"test-b\", \"test-c\"], \"infos\": [TRACE_TARGET_VALUE] * 3}\n",
+ "DSPY_TRAIN_DATASET = {\n",
+ " \"inputs\": [\"customer tier scarlet\", \"customer tier azure\", \"customer tier emerald\"],\n",
+ " \"infos\": [\"A\", \"B\", \"C\"],\n",
"}\n",
- "SMOKE_TEST_DATASET = {\n",
- " \"inputs\": [\"test-a\", \"test-b\", \"test-c\"],\n",
- " \"infos\": [SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE],\n",
+ "DSPY_TEST_DATASET = {\n",
+ " \"inputs\": [\"routing code for scarlet ticket\", \"routing code for azure ticket\", \"routing code for emerald ticket\"],\n",
+ " \"infos\": [\"A\", \"B\", \"C\"],\n",
"}\n",
"\n",
- "class ScalarDSPySignature(dspy.Signature):\n",
- " \"\"\"Always answer 0.\"\"\"\n",
- " question: str = dspy.InputField()\n",
- " answer: str = dspy.OutputField(desc=\"numeric scalar answer only\")\n",
+ "class RoutingDSPySignature(dspy.Signature):\n",
+ " \"\"\"Return the requested routing code as a single uppercase letter.\"\"\"\n",
+ " ticket: str = dspy.InputField()\n",
+ " answer: str = dspy.OutputField(desc=\"single uppercase letter\")\n",
"\n",
- "class ScalarDSPyAgent(dspy.Module):\n",
- " \"\"\"Tiny DSPy module for the real DSPyTrainer smoke row.\"\"\"\n",
+ "class RoutingDSPyAgent(dspy.Module):\n",
+ " \"\"\"Small DSPy module optimized through DSPyTrainer.\"\"\"\n",
" def __init__(self) -> None:\n",
" super().__init__()\n",
- " self.predict = dspy.Predict(ScalarDSPySignature)\n",
+ " self.predict = dspy.Predict(RoutingDSPySignature)\n",
"\n",
- " def forward(self, question: str) -> str:\n",
- " return self.predict(question=question).answer\n",
+ " def forward(self, ticket: str) -> str:\n",
+ " return self.predict(ticket=ticket).answer\n",
"\n",
" @classmethod\n",
" def to_examples(cls, inputs: list[Any], infos: list[Any]) -> list[Any]:\n",
" return [\n",
- " dspy.Example(question=str(task_input), answer=str(task_info), _task=task_input, _info=task_info).with_inputs(\"question\")\n",
- " for task_input, task_info in zip(inputs, infos)\n",
+ " dspy.Example(ticket=str(ticket), answer=str(code), _task=ticket, _info=code).with_inputs(\"ticket\")\n",
+ " for ticket, code in zip(inputs, infos)\n",
" ]\n",
"\n",
- "class ScalarDSPyGuide:\n",
- " \"\"\"Score numeric DSPy answers against the scalar target.\"\"\"\n",
+ "class RoutingDSPyGuide:\n",
+ " \"\"\"Exact-match routing-code metric for DSPy optimizers.\"\"\"\n",
" def get_feedback(self, _query: Any, response: Any, reference: Any, **_kwargs: Any) -> tuple[float, str]:\n",
- " text = str(getattr(response, \"data\", response)).strip()\n",
- " match = re.search(r\"-?\\d+(?:\\.\\d+)?\", text)\n",
- " prediction = float(match.group(0)) if match else float(\"nan\")\n",
- " target = float(reference)\n",
- " score = -abs(prediction - target) if prediction == prediction else -10.0\n",
- " return score, f\"target={target}; response={text}\"\n",
+ " text = str(getattr(response, \"data\", response)).strip().upper()\n",
+ " match = re.search(r\"\\b[A-Z]\\b\", text)\n",
+ " prediction = match.group(0) if match else text[:1]\n",
+ " target = str(reference).strip().upper()\n",
+ " score = 1.0 if prediction == target else 0.0\n",
+ " return score, f\"expected={target}; response={text}\"\n",
"\n",
" def __call__(self, query: Any, response: Any, reference: Any, **kwargs: Any) -> tuple[float, str]:\n",
" return self.get_feedback(query, response, reference, **kwargs)\n",
"\n",
- "def make_trace_bundle() -> dict[str, Any]:\n",
- " \"\"\"Load the existing Trace-Bench scalar example bundle.\"\"\"\n",
- " return load_task_bundle(TRACE_TASK_ID, TASKS_ROOT)\n",
- "\n",
"def _set_only_scalar_trainable(bundle: dict[str, Any]) -> None:\n",
- " \"\"\"Keep the smoke focused on the existing scalar parameter.\"\"\"\n",
" param = bundle[\"param\"]\n",
" scalar = getattr(param, \"value\", None)\n",
" if scalar is None:\n",
" scalar = getattr(param, \"guess\", None)\n",
" if scalar is None:\n",
- " raise AttributeError(\"Scalar smoke task requires param.value or param.guess.\")\n",
+ " raise AttributeError(\"Scalar demo task requires param.value or param.guess.\")\n",
" for parameter in param.parameters():\n",
" parameter.trainable = parameter is scalar\n",
- " apply_parameter_updates({scalar: SMOKE_INITIAL_VALUE})\n",
+ " apply_parameter_updates({scalar: TRACE_INITIAL_VALUE})\n",
"\n",
- "def make_trace_smoke_bundle() -> dict[str, Any]:\n",
- " \"\"\"Build a fresh train/test smoke bundle from the Trace scalar example.\"\"\"\n",
- " bundle = make_trace_bundle()\n",
+ "def make_trace_demo_bundle() -> dict[str, Any]:\n",
+ " bundle = load_task_bundle(TRACE_TASK_ID, TASKS_ROOT)\n",
" _set_only_scalar_trainable(bundle)\n",
- " bundle[\"train_dataset\"] = SMOKE_TRAIN_DATASET\n",
- " bundle[\"test_dataset\"] = SMOKE_TEST_DATASET\n",
+ " bundle[\"train_dataset\"] = TRACE_TRAIN_DATASET\n",
+ " bundle[\"test_dataset\"] = TRACE_TEST_DATASET\n",
" bundle.pop(\"validate_dataset\", None)\n",
- " bundle[\"optimizer_kwargs\"][\"objective\"] = f\"Set the trainable scalar to exactly {SMOKE_TARGET_VALUE}.\"\n",
+ " bundle[\"optimizer_kwargs\"][\"objective\"] = f\"Set the trainable scalar to exactly {TRACE_TARGET_VALUE}.\"\n",
" bundle[\"metadata\"][\"task_label\"] = \"Trace scalar\"\n",
" return bundle\n",
"\n",
- "def make_dspy_lm() -> Any:\n",
- " \"\"\"Build the real DSPy LM from the configured provider environment.\"\"\"\n",
+ "def make_dspy_lm(max_tokens: int = 200) -> Any:\n",
" model = os.environ.get(\"TRACE_LITELLM_MODEL\") or \"gpt-4o-mini\"\n",
" if \"/\" not in model and (\"gpt\" in model.lower() or model.lower().startswith(\"o\")):\n",
" model = f\"openai/{model}\"\n",
- " lm_kwargs: dict[str, Any] = {\"cache\": False}\n",
+ " lm_kwargs: dict[str, Any] = {\"cache\": False, \"max_tokens\": max_tokens}\n",
" api_base = os.environ.get(\"OPENAI_BASE_URL\") or os.environ.get(\"OPENAI_API_BASE\")\n",
" if api_base:\n",
" lm_kwargs[\"api_base\"] = api_base\n",
" return dspy.LM(model=model, **lm_kwargs)\n",
"\n",
- "def make_dspy_smoke_bundle() -> dict[str, Any]:\n",
- " \"\"\"Build a fresh train/test smoke bundle for the DSPy trainer row.\"\"\"\n",
+ "def make_dspy_demo_bundle() -> dict[str, Any]:\n",
" dspy.configure(lm=make_dspy_lm())\n",
" return {\n",
- " \"param\": ScalarDSPyAgent(),\n",
- " \"guide\": ScalarDSPyGuide(),\n",
- " \"train_dataset\": SMOKE_TRAIN_DATASET,\n",
- " \"test_dataset\": SMOKE_TEST_DATASET,\n",
- " \"optimizer_kwargs\": {\"objective\": f\"Answer every scalar benchmark item with exactly {SMOKE_TARGET_VALUE}.\"},\n",
- " \"metadata\": {\"task_label\": \"DSPy scalar\", \"framework\": \"dspy\"},\n",
+ " \"param\": RoutingDSPyAgent(),\n",
+ " \"guide\": RoutingDSPyGuide(),\n",
+ " \"train_dataset\": DSPY_TRAIN_DATASET,\n",
+ " \"test_dataset\": DSPY_TEST_DATASET,\n",
+ " \"optimizer_kwargs\": {},\n",
+ " \"metadata\": {\"task_label\": \"DSPy routing code\", \"framework\": \"dspy\"},\n",
" }\n",
"\n",
- "def short_text(value: Any, limit: int = 80) -> str:\n",
- " \"\"\"Return a compact display value for comparison tables.\"\"\"\n",
+ "def short_text(value: Any, limit: int = 100) -> str:\n",
" text = str(value)\n",
" return text if len(text) <= limit else text[: limit - 3] + \"...\"\n",
"\n",
"def snapshot_trainable_value(bundle: dict[str, Any]) -> Any:\n",
- " \"\"\"Return the current scalar value or DSPy instruction.\"\"\"\n",
" scalar = getattr(bundle[\"param\"], \"value\", None)\n",
" if scalar is None:\n",
" scalar = getattr(bundle[\"param\"], \"guess\", None)\n",
@@ -898,35 +842,47 @@
" return short_text(getattr(signature, \"instructions\", type(bundle[\"param\"]).__name__))\n",
"\n",
"def task_label(bundle: dict[str, Any]) -> str:\n",
- " \"\"\"Return the display label for a smoke bundle.\"\"\"\n",
- " return str(bundle.get(\"metadata\", {}).get(\"task_label\") or bundle.get(\"metadata\", {}).get(\"benchmark\") or \"smoke\")\n",
+ " metadata = bundle.get(\"metadata\", {})\n",
+ " return str(metadata.get(\"task_label\") or metadata.get(\"benchmark\") or \"demo\")\n",
"\n",
"def output_value(output: Any) -> Any:\n",
- " \"\"\"Return a compact scalar/string output value.\"\"\"\n",
- " return short_text(getattr(output, \"data\", output), limit=120)\n",
+ " return short_text(getattr(output, \"data\", output), limit=140)\n",
"\n",
"def score_guide(guide: Any, task_input: Any, response: Any, task_info: Any) -> tuple[float, str]:\n",
- " \"\"\"Score with Trace Guide or DSPy-style get_feedback guide.\"\"\"\n",
- " if callable(guide):\n",
- " score, feedback = guide(task_input, response, task_info)\n",
- " else:\n",
- " score, feedback = guide.get_feedback(task_input, response, task_info)\n",
+ " score, feedback = guide(task_input, response, task_info) if callable(guide) else guide.get_feedback(task_input, response, task_info)\n",
" return float(score), str(feedback)\n",
"\n",
+ "def score_dataset(bundle: dict[str, Any], dataset: dict[str, list[Any]]) -> dict[str, Any]:\n",
+ " inputs = dataset.get(\"inputs\") or []\n",
+ " infos = dataset.get(\"infos\") or dataset.get(\"info\") or []\n",
+ " if len(inputs) != len(infos):\n",
+ " raise ValueError(\"Dataset 'inputs' and 'infos' must have the same length.\")\n",
+ " if not inputs:\n",
+ " raise ValueError(\"Dataset must contain at least one example.\")\n",
+ " rows = []\n",
+ " scores = []\n",
+ " for task_input, task_info in zip(inputs, infos):\n",
+ " response = output_value(bundle[\"param\"](task_input))\n",
+ " score, feedback = score_guide(bundle[\"guide\"], task_input, response, task_info)\n",
+ " scores.append(score)\n",
+ " rows.append({\"input\": task_input, \"expected\": task_info, \"output\": response, \"score\": score, \"feedback\": feedback})\n",
+ " return {\"mean_score\": sum(scores) / len(scores), \"rows\": rows}\n",
+ "\n",
"def run_train_bundle(\n",
" trainer_id: str,\n",
" params: dict[str, Any] | None = None,\n",
" mode: str = \"real\",\n",
" logger: str = \"none\",\n",
- " bundle_factory: Any = make_trace_smoke_bundle,\n",
+ " bundle_factory: Callable[[], dict[str, Any]] = make_trace_demo_bundle,\n",
") -> dict[str, Any]:\n",
- " \"\"\"Run one trainer on the 3-example train split and score the 3-example test split.\"\"\"\n",
" bundle = bundle_factory()\n",
" params = params or {}\n",
+ " train_dataset = bundle[\"train_dataset\"]\n",
+ " test_dataset = bundle.get(\"test_dataset\") or train_dataset\n",
" before = {\n",
" \"value\": snapshot_trainable_value(bundle),\n",
- " \"train\": score_dataset(bundle, SMOKE_TRAIN_DATASET),\n",
- " \"test\": score_dataset(bundle, SMOKE_TEST_DATASET),\n",
+ " \"train\": score_dataset(bundle, train_dataset),\n",
+ " \"test\": score_dataset(bundle, test_dataset),\n",
" }\n",
" result = _train_bundle(\n",
" bundle=bundle,\n",
@@ -936,34 +892,19 @@
" )\n",
" after = {\n",
" \"value\": snapshot_trainable_value(bundle),\n",
- " \"train\": score_dataset(bundle, SMOKE_TRAIN_DATASET),\n",
- " \"test\": score_dataset(bundle, SMOKE_TEST_DATASET),\n",
+ " \"train\": score_dataset(bundle, train_dataset),\n",
+ " \"test\": score_dataset(bundle, test_dataset),\n",
" }\n",
- " return {\"trainer_id\": trainer_id, \"task\": task_label(bundle), \"mode\": mode, \"result\": result, \"before\": before, \"after\": after}\n",
- "\n",
- "def score_dataset(bundle: dict[str, Any], dataset: dict[str, list[Any]]) -> dict[str, Any]:\n",
- " \"\"\"Evaluate a bundle on a dataset and retain per-example outputs.\"\"\"\n",
- " inputs = dataset.get(\"inputs\") or []\n",
- " infos = dataset.get(\"infos\") or dataset.get(\"info\") or []\n",
- " if len(inputs) != len(infos):\n",
- " raise ValueError(\"Dataset 'inputs' and 'infos' must have the same length.\")\n",
- " if not inputs:\n",
- " raise ValueError(\"Dataset must contain at least one example.\")\n",
- "\n",
- " rows = []\n",
- " scores = []\n",
- " for task_input, task_info in zip(inputs, infos):\n",
- " response = output_value(bundle[\"param\"](task_input))\n",
- " score, feedback = score_guide(bundle[\"guide\"], task_input, response, task_info)\n",
- " scores.append(score)\n",
- " rows.append({\n",
- " \"input\": task_input,\n",
- " \"expected\": task_info,\n",
- " \"output\": response,\n",
- " \"score\": score,\n",
- " \"feedback\": feedback,\n",
- " })\n",
- " return {\"mean_score\": sum(scores) / len(scores), \"rows\": rows}\n"
+ " return {\n",
+ " \"trainer_id\": trainer_id,\n",
+ " \"task\": task_label(bundle),\n",
+ " \"mode\": mode,\n",
+ " \"result\": result,\n",
+ " \"before\": before,\n",
+ " \"after\": after,\n",
+ " \"train_examples\": len(train_dataset[\"inputs\"]),\n",
+ " \"test_examples\": len(test_dataset[\"inputs\"]),\n",
+ " }"
]
},
{
@@ -971,9 +912,9 @@
"id": "33501896",
"metadata": {},
"source": [
- "## 8. Real train/test smoke runs\n",
+ "## 8. Real train/test optimization runs\n",
"\n",
- "These runs use the real Trace-Bench trainer entry points and real installed trainer packages. They are intentionally tiny: small optimizer budgets, three training examples, and three held-out examples."
+ "These runs use the real Trace-Bench trainer entry points and real installed trainer packages. Optimizer logs are captured so the notebook output stays focused on the comparison tables."
]
},
{
@@ -982,10 +923,10 @@
"id": "f62a8443",
"metadata": {
"execution": {
- "iopub.execute_input": "2026-05-28T17:29:57.618788Z",
- "iopub.status.busy": "2026-05-28T17:29:57.618723Z",
- "iopub.status.idle": "2026-05-28T17:31:23.199295Z",
- "shell.execute_reply": "2026-05-28T17:31:23.198801Z"
+ "iopub.execute_input": "2026-06-02T16:40:29.261316Z",
+ "iopub.status.busy": "2026-06-02T16:40:29.261235Z",
+ "iopub.status.idle": "2026-06-02T16:44:58.092383Z",
+ "shell.execute_reply": "2026-06-02T16:44:58.091779Z"
}
},
"outputs": [
@@ -993,1166 +934,244 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n",
- "PrioritySearch initialized with only long-term memory.\n",
- "Epoch: 0. Iteration: 0\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Sampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Sampling training minibatch: Sampling 2 agents on 1 inputs: 100%|██████████| 2/2 [00:00<00:00, 21024.08it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "/home/xav/code/Trace-Bench/NewTrace/opto/trainer/utils.py:76: RuntimeWarning: coroutine 'async_run.._run' was never awaited\n",
- " with concurrent.futures.ThreadPoolExecutor() as executor:\n",
- "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Evaluating agent: 0%| | 0/3 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Evaluating agent: 100%|██████████| 3/3 [00:00<00:00, 25944.15it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
+ "PrioritySearch: ok (OptoPrimeV2)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Epoch: 0. Iteration: 1\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Backward: 0%| | 0/2 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Backward: 100%|██████████| 2/2 [00:00<00:00, 17549.39it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Calling optimizers: Generating 2 proposals for each of 2 batches: 0%| | 0/4 [00:00, ?it/s]"
+ "TextGradTrainer: ok (opto.optimizers.textgrad.TextGrad)\n"
]
},
{
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Calling optimizers: Generating 2 proposals for each of 2 batches: 25%|██▌ | 1/4 [00:02<00:08, 2.92s/it]"
- ]
- },
- {
- "name": "stderr",
+ "name": "stdout",
"output_type": "stream",
"text": [
- "\r",
- "Calling optimizers: Generating 2 proposals for each of 2 batches: 75%|███████▌ | 3/4 [00:03<00:00, 1.16it/s]"
+ "OpenEvolveTrainer: ok (openevolve.run_evolution)\n"
]
},
{
- "name": "stderr",
+ "name": "stdout",
"output_type": "stream",
"text": [
- "\r",
- "Calling optimizers: Generating 2 proposals for each of 2 batches: 100%|██████████| 4/4 [00:04<00:00, 1.09it/s]"
+ "DSPyTrainer: ok (dspy.MIPROv2)\n",
+ "Completed 4 real trainer runs.\n"
]
- },
+ }
+ ],
+ "source": [
+ "if active_provider == \"none\":\n",
+ " raise RuntimeError(\"Real comparison requires OPENROUTER_API_KEY or OPENAI_API_KEY.\")\n",
+ "\n",
+ "REAL_TRAINERS = [\n",
+ " (\"PrioritySearch\", \"Trace scalar\", {\n",
+ " \"ps_steps\": 2,\n",
+ " \"ps_batches\": 1,\n",
+ " \"num_candidates\": 3,\n",
+ " \"num_proposals\": 2,\n",
+ " }, make_trace_demo_bundle),\n",
+ " (\"TextGradTrainer\", \"Trace scalar\", {\n",
+ " \"num_epochs\": 2,\n",
+ " \"batch_size\": 1,\n",
+ " \"ensure_improvement\": True,\n",
+ " \"improvement_threshold\": 1e-9,\n",
+ " \"max_tokens\": 1024,\n",
+ " }, make_trace_demo_bundle),\n",
+ " (\"OpenEvolveTrainer\", \"Trace scalar\", {\n",
+ " \"iterations\": 4,\n",
+ " \"population_size\": 8,\n",
+ " \"num_islands\": 1,\n",
+ " \"seed\": 3,\n",
+ " \"ensure_improvement\": True,\n",
+ " \"improvement_threshold\": 1e-9,\n",
+ " \"verbose\": False,\n",
+ " \"model\": os.environ.get(\"TRACE_LITELLM_MODEL\"),\n",
+ " \"api_base\": os.environ.get(\"OPENAI_BASE_URL\") or os.environ.get(\"OPENAI_API_BASE\"),\n",
+ " \"api_key_env\": \"OPENAI_API_KEY\",\n",
+ " \"max_tokens\": 2048,\n",
+ " \"temperature\": 0.4,\n",
+ " }, make_trace_demo_bundle),\n",
+ " (\"DSPyTrainer\", \"DSPy routing code\", {\n",
+ " \"dspy_optimizer\": \"mipro\",\n",
+ " \"dspy_lm\": make_dspy_lm(),\n",
+ " \"auto\": None,\n",
+ " \"num_candidates\": 4,\n",
+ " \"num_trials\": 5,\n",
+ " \"max_labeled_demos\": 3,\n",
+ " \"max_bootstrapped_demos\": 1,\n",
+ " \"num_threads\": 1,\n",
+ " \"seed\": 7,\n",
+ " \"verbose\": False,\n",
+ " }, make_dspy_demo_bundle),\n",
+ "]\n",
+ "\n",
+ "smoke_results = []\n",
+ "for trainer_id, task, params, bundle_factory in REAL_TRAINERS:\n",
+ " captured = io.StringIO()\n",
+ " try:\n",
+ " with contextlib.redirect_stdout(captured), contextlib.redirect_stderr(captured):\n",
+ " item = run_train_bundle(trainer_id, params=params, mode=\"real\", bundle_factory=bundle_factory)\n",
+ " item[\"captured_log_tail\"] = \"\\n\".join(captured.getvalue().splitlines()[-8:])\n",
+ " smoke_results.append(item)\n",
+ " print(f\"{trainer_id}: {item['result'].get('status')} ({item['result'].get('resolved_optimizer')})\")\n",
+ " except Exception as exc:\n",
+ " smoke_results.append({\n",
+ " \"trainer_id\": trainer_id,\n",
+ " \"task\": task,\n",
+ " \"mode\": \"real\",\n",
+ " \"status\": \"error\",\n",
+ " \"error\": f\"{type(exc).__name__}: {exc}\",\n",
+ " \"captured_log_tail\": \"\\n\".join(captured.getvalue().splitlines()[-8:]),\n",
+ " })\n",
+ " print(f\"{trainer_id}: error\")\n",
+ "\n",
+ "print(f\"Completed {len(smoke_results)} real trainer runs.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "f4ef5b90",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-06-02T16:44:58.093951Z",
+ "iopub.status.busy": "2026-06-02T16:44:58.093874Z",
+ "iopub.status.idle": "2026-06-02T16:44:58.144952Z",
+ "shell.execute_reply": "2026-06-02T16:44:58.144396Z"
+ }
+ },
+ "outputs": [
{
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Calling optimizers: Generating 2 proposals for each of 2 batches: 100%|██████████| 4/4 [00:04<00:00, 1.06s/it]"
- ]
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | | \n",
+ " trainer_id | \n",
+ " task | \n",
+ " mode | \n",
+ " status | \n",
+ " resolved_optimizer | \n",
+ " before_value | \n",
+ " after_value | \n",
+ " train_examples | \n",
+ " test_examples | \n",
+ " before_train_score | \n",
+ " after_train_score | \n",
+ " train_delta | \n",
+ " before_test_score | \n",
+ " after_test_score | \n",
+ " test_delta | \n",
+ " improvement | \n",
+ " error | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " PrioritySearch | \n",
+ " Trace scalar | \n",
+ " real | \n",
+ " ok | \n",
+ " OptoPrimeV2 | \n",
+ " 0.000000 | \n",
+ " 3.000000 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " -3.000 | \n",
+ " 0.000 | \n",
+ " 3.000 | \n",
+ " -3.000 | \n",
+ " 0.000 | \n",
+ " 3.000 | \n",
+ " YES | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " TextGradTrainer | \n",
+ " Trace scalar | \n",
+ " real | \n",
+ " ok | \n",
+ " opto.optimizers.textgrad.TextGrad | \n",
+ " 0.000000 | \n",
+ " 3.000000 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " -3.000 | \n",
+ " 0.000 | \n",
+ " 3.000 | \n",
+ " -3.000 | \n",
+ " 0.000 | \n",
+ " 3.000 | \n",
+ " YES | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " OpenEvolveTrainer | \n",
+ " Trace scalar | \n",
+ " real | \n",
+ " ok | \n",
+ " openevolve.run_evolution | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " -3.000 | \n",
+ " -2.000 | \n",
+ " 1.000 | \n",
+ " -3.000 | \n",
+ " -2.000 | \n",
+ " 1.000 | \n",
+ " YES | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " DSPyTrainer | \n",
+ " DSPy routing code | \n",
+ " real | \n",
+ " ok | \n",
+ " dspy.MIPROv2 | \n",
+ " Return the requested routing code as a single uppercase letter. | \n",
+ " Based on the customer tier mentioned in the ticket (e.g., \"customer tier scarlet\", \"customer tier... | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 0.000 | \n",
+ " 1.000 | \n",
+ " 1.000 | \n",
+ " 0.000 | \n",
+ " 1.000 | \n",
+ " 1.000 | \n",
+ " YES | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
{
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Validating newly proposed candidates: Sampling 4 agents on 1 inputs: 0%| | 0/4 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Validating newly proposed candidates: Sampling 4 agents on 1 inputs: 100%|██████████| 4/4 [00:00<00:00, 23497.50it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Sampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Sampling training minibatch: Sampling 2 agents on 1 inputs: 100%|██████████| 2/2 [00:00<00:00, 16416.06it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Evaluating agent: 0%| | 0/3 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Evaluating agent: 100%|██████████| 3/3 [00:00<00:00, 14282.53it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch: 0. Iteration: 2\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Backward: 0%| | 0/2 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Backward: 100%|██████████| 2/2 [00:00<00:00, 17189.77it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Calling optimizers: Generating 2 proposals for each of 2 batches: 0%| | 0/4 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Calling optimizers: Generating 2 proposals for each of 2 batches: 25%|██▌ | 1/4 [00:02<00:06, 2.13s/it]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Calling optimizers: Generating 2 proposals for each of 2 batches: 50%|█████ | 2/4 [00:02<00:02, 1.07s/it]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Calling optimizers: Generating 2 proposals for each of 2 batches: 75%|███████▌ | 3/4 [00:04<00:01, 1.30s/it]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Calling optimizers: Generating 2 proposals for each of 2 batches: 100%|██████████| 4/4 [00:04<00:00, 1.01s/it]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Validating newly proposed candidates: Sampling 4 agents on 1 inputs: 0%| | 0/4 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Validating newly proposed candidates: Sampling 4 agents on 1 inputs: 100%|██████████| 4/4 [00:00<00:00, 16400.02it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Sampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Sampling training minibatch: Sampling 2 agents on 1 inputs: 100%|██████████| 2/2 [00:00<00:00, 25575.02it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Evaluating agent: 0%| | 0/3 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Evaluating agent: 100%|██████████| 3/3 [00:00<00:00, 22961.52it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch: 0. Iteration: 3\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Backward: 0%| | 0/2 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Backward: 100%|██████████| 2/2 [00:00<00:00, 14146.05it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Calling optimizers: Generating 2 proposals for each of 2 batches: 0%| | 0/4 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Calling optimizers: Generating 2 proposals for each of 2 batches: 25%|██▌ | 1/4 [00:01<00:05, 1.94s/it]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Calling optimizers: Generating 2 proposals for each of 2 batches: 50%|█████ | 2/4 [00:02<00:01, 1.02it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Calling optimizers: Generating 2 proposals for each of 2 batches: 100%|██████████| 4/4 [00:02<00:00, 1.79it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Calling optimizers: Generating 2 proposals for each of 2 batches: 100%|██████████| 4/4 [00:02<00:00, 1.39it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Validating newly proposed candidates: Sampling 3 agents on 1 inputs: 0%| | 0/3 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Validating newly proposed candidates: Sampling 3 agents on 1 inputs: 100%|██████████| 3/3 [00:00<00:00, 13052.81it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Sampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Sampling training minibatch: Sampling 2 agents on 1 inputs: 100%|██████████| 2/2 [00:00<00:00, 15087.42it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Evaluating agent: 0%| | 0/3 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- "Evaluating agent: 100%|██████████| 3/3 [00:00<00:00, 17549.39it/s]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,733 - INFO - Logging to /tmp/openevolve_n_kqvfcc/logs/openevolve_20260528_193053.log\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,737 - INFO - Initialized OpenAI LLM with model: openai/gpt-4o-mini\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,737 - INFO - Initialized LLM ensemble with models: openai/gpt-4o-mini (weight: 1.00)\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,741 - INFO - Initialized prompt sampler\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,741 - INFO - Set custom templates: system=evaluator_system_message, user=None\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,742 - INFO - Initialized program database with 0 programs\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,742 - INFO - Successfully loaded evaluation function from /tmp/openevolve_n_kqvfcc/evaluator_64b0ea1b.py\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,742 - INFO - Initialized evaluator with /tmp/openevolve_n_kqvfcc/evaluator_64b0ea1b.py\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,743 - INFO - Initialized OpenEvolve with /tmp/openevolve_n_kqvfcc/program_3d968e0b.py\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,743 - INFO - Adding initial program to database\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,744 - INFO - Evaluated program ba0c9536-5ae5-4298-a263-e2c8b153273e in 0.00s: score=-3.0000, feedback=target=3.0 | target=3.0 | target=3.0, artifacts={'candidate': {'int2': 0.0}}\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,744 - INFO - New MAP-Elites cell occupied in island 0: {'complexity': 5, 'diversity': 0}\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,744 - WARNING - ⚠️ No 'combined_score' metric found in evaluation results. Using average of all numeric metrics (-3.0000) for evolution guidance. For better evolution results, please modify your evaluator to return a 'combined_score' metric that properly weights different aspects of program performance.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,745 - INFO - Initialized process parallel controller with 1 workers\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,745 - INFO - Set max None tasks per child\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,746 - INFO - Started process pool with 1 processes\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,746 - INFO - Using island-based evolution with 5 islands\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,746 - INFO - Island Status:\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,746 - INFO - * Island 0: 1 programs, best=-3.0000, avg=-3.0000, diversity=0.00, gen=0 (best: ba0c9536-5ae5-4298-a263-e2c8b153273e)\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,746 - INFO - Island 1: 0 programs, best=0.0000, avg=0.0000, diversity=0.00, gen=0\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,747 - INFO - Island 2: 0 programs, best=0.0000, avg=0.0000, diversity=0.00, gen=0\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,747 - INFO - Island 3: 0 programs, best=0.0000, avg=0.0000, diversity=0.00, gen=0\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,747 - INFO - Island 4: 0 programs, best=0.0000, avg=0.0000, diversity=0.00, gen=0\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,747 - INFO - Starting process-based evolution from iteration 1 for 1 iterations (total: 2)\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,775 - INFO - Early stopping disabled\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,786 - INFO - Set custom templates: system=evaluator_system_message, user=None\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,787 - INFO - Successfully loaded evaluation function from /tmp/openevolve_n_kqvfcc/evaluator_64b0ea1b.py\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,788 - INFO - Initialized evaluator with /tmp/openevolve_n_kqvfcc/evaluator_64b0ea1b.py\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:30:53,789 - INFO - Sampled model: openai/gpt-4o-mini\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:31:01,289 - INFO - Evaluated program adb409ae-1ff6-48e0-acd7-77eaa4742938 in 0.00s: score=-inf, feedback=Candidate program must contain exactly one assignment to 'candidate'.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:31:01,293 - INFO - New MAP-Elites cell occupied in island 0: {'complexity': 9, 'diversity': 5}\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:31:01,294 - INFO - Iteration 1: Program adb409ae-1ff6-48e0-acd7-77eaa4742938 (parent: ba0c9536-5ae5-4298-a263-e2c8b153273e) completed in 7.50s\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:31:01,294 - INFO - Metrics: score=-inf, feedback=Candidate program must contain exactly one assignment to 'candidate'.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:31:01,295 - WARNING - ⚠️ No 'combined_score' metric found in evaluation results. Using average of all numeric metrics (-inf) for evolution guidance. For better evolution results, please modify your evaluator to return a 'combined_score' metric that properly weights different aspects of program performance.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:31:01,295 - INFO - ✅ Evolution completed - Maximum iterations reached\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:31:01,305 - INFO - Stopped process pool\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:31:01,306 - INFO - Using tracked best program: ba0c9536-5ae5-4298-a263-e2c8b153273e\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:31:01,306 - INFO - Evolution complete. Best program has metrics: score=-3.0000, feedback=target=3.0 | target=3.0 | target=3.0, artifacts={'candidate': {'int2': 0.0}}\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-05-28 19:31:01,306 - INFO - Saved best program to /tmp/openevolve_n_kqvfcc/best/best_program.py with program info to /tmp/openevolve_n_kqvfcc/best/best_program_info.json\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026/05/28 19:31:09 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 1/1.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026/05/28 19:31:09 INFO dspy.teleprompt.copro_optimizer: At Depth 1/1, Evaluating Prompt Candidate #1/2 for Predictor 1 of 1.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- " 0%| | 0/3 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- "Average Metric: -3.00 / 1 (-300.0%): 0%| | 0/3 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- "Average Metric: -3.00 / 1 (-300.0%): 33%|███▎ | 1/3 [00:00<00:01, 1.09it/s]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- "Average Metric: -6.00 / 2 (-300.0%): 33%|███▎ | 1/3 [00:01<00:01, 1.09it/s]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- "Average Metric: -6.00 / 2 (-300.0%): 67%|██████▋ | 2/3 [00:01<00:00, 1.03it/s]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- "Average Metric: -9.00 / 3 (-300.0%): 67%|██████▋ | 2/3 [00:03<00:00, 1.03it/s]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- "Average Metric: -9.00 / 3 (-300.0%): 100%|██████████| 3/3 [00:03<00:00, 1.05s/it]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- "Average Metric: -9.00 / 3 (-300.0%): 100%|██████████| 3/3 [00:03<00:00, 1.02s/it]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026/05/28 19:31:12 INFO dspy.evaluate.evaluate: Average Metric: -9.0 / 3 (-300.0%)\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026/05/28 19:31:12 INFO dspy.teleprompt.copro_optimizer: At Depth 1/1, Evaluating Prompt Candidate #2/2 for Predictor 1 of 1.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- " 0%| | 0/3 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- "Average Metric: -3.00 / 1 (-300.0%): 0%| | 0/3 [00:01, ?it/s]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- "Average Metric: -3.00 / 1 (-300.0%): 33%|███▎ | 1/3 [00:01<00:03, 1.53s/it]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- "Average Metric: -6.00 / 2 (-300.0%): 33%|███▎ | 1/3 [00:03<00:03, 1.53s/it]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- "Average Metric: -6.00 / 2 (-300.0%): 67%|██████▋ | 2/3 [00:03<00:01, 1.53s/it]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- "Average Metric: -9.00 / 3 (-300.0%): 67%|██████▋ | 2/3 [00:04<00:01, 1.53s/it]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- "Average Metric: -9.00 / 3 (-300.0%): 100%|██████████| 3/3 [00:04<00:00, 1.40s/it]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\r",
- "Average Metric: -9.00 / 3 (-300.0%): 100%|██████████| 3/3 [00:04<00:00, 1.44s/it]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026/05/28 19:31:16 INFO dspy.evaluate.evaluate: Average Metric: -9.0 / 3 (-300.0%)\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Completed 4 real trainer smoke runs.\n"
- ]
- }
- ],
- "source": [
- "if active_provider == \"none\":\n",
- " raise RuntimeError(\"Real smoke comparison requires OPENROUTER_API_KEY or OPENAI_API_KEY.\")\n",
- "\n",
- "SMOKE_TRAINERS = [\n",
- " (\"PrioritySearch\", \"Trace scalar\", {\"ps_steps\": 1, \"ps_batches\": 1, \"num_candidates\": 2, \"num_proposals\": 2}, make_trace_smoke_bundle),\n",
- " (\"TextGradTrainer\", \"Trace scalar\", {\"num_epochs\": 1, \"batch_size\": 1, \"ensure_improvement\": True, \"improvement_threshold\": 1e-9, \"max_tokens\": 1024}, make_trace_smoke_bundle),\n",
- " (\"OpenEvolveTrainer\", \"Trace scalar\", {\n",
- " \"iterations\": 1,\n",
- " \"ensure_improvement\": True,\n",
- " \"improvement_threshold\": 1e-9,\n",
- " \"verbose\": False,\n",
- " \"model\": os.environ.get(\"TRACE_LITELLM_MODEL\"),\n",
- " \"api_base\": os.environ.get(\"OPENAI_BASE_URL\") or os.environ.get(\"OPENAI_API_BASE\"),\n",
- " \"api_key_env\": \"OPENAI_API_KEY\",\n",
- " \"max_tokens\": 1024,\n",
- " }, make_trace_smoke_bundle),\n",
- " (\"DSPyTrainer\", \"DSPy scalar\", {\n",
- " \"dspy_optimizer\": \"copro\",\n",
- " \"dspy_lm\": make_dspy_lm(),\n",
- " \"breadth\": 2,\n",
- " \"depth\": 1,\n",
- " \"num_threads\": 1,\n",
- " \"track_stats\": False,\n",
- " }, make_dspy_smoke_bundle),\n",
- "]\n",
- "\n",
- "smoke_results = []\n",
- "for trainer_id, task, params, bundle_factory in SMOKE_TRAINERS:\n",
- " try:\n",
- " smoke_results.append(run_train_bundle(trainer_id, params=params, mode=\"real\", bundle_factory=bundle_factory))\n",
- " except Exception as exc:\n",
- " smoke_results.append({\n",
- " \"trainer_id\": trainer_id,\n",
- " \"task\": task,\n",
- " \"mode\": \"real\",\n",
- " \"status\": \"error\",\n",
- " \"error\": f\"{type(exc).__name__}: {exc}\",\n",
- " })\n",
- "\n",
- "print(f\"Completed {len(smoke_results)} real trainer smoke runs.\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "f4ef5b90",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2026-05-28T17:31:23.200605Z",
- "iopub.status.busy": "2026-05-28T17:31:23.200529Z",
- "iopub.status.idle": "2026-05-28T17:31:23.220162Z",
- "shell.execute_reply": "2026-05-28T17:31:23.219570Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " trainer_id | \n",
- " task | \n",
- " mode | \n",
- " status | \n",
- " resolved_optimizer | \n",
- " before_value | \n",
- " after_value | \n",
- " train_examples | \n",
- " test_examples | \n",
- " before_train_score | \n",
- " after_train_score | \n",
- " train_delta | \n",
- " before_test_score | \n",
- " after_test_score | \n",
- " test_delta | \n",
- " error | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " PrioritySearch | \n",
- " Trace scalar | \n",
- " real | \n",
- " ok | \n",
- " OptoPrimeV2 | \n",
- " 0.0 | \n",
- " 3.0 | \n",
- " 3 | \n",
- " 3 | \n",
- " -3.0 | \n",
- " 0.0 | \n",
- " 3.0 | \n",
- " -3.0 | \n",
- " 0.0 | \n",
- " 3.0 | \n",
- " None | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " TextGradTrainer | \n",
- " Trace scalar | \n",
- " real | \n",
- " ok | \n",
- " opto.optimizers.textgrad.TextGrad | \n",
- " 0.0 | \n",
- " 3.0 | \n",
- " 3 | \n",
- " 3 | \n",
- " -3.0 | \n",
- " 0.0 | \n",
- " 3.0 | \n",
- " -3.0 | \n",
- " 0.0 | \n",
- " 3.0 | \n",
- " None | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " OpenEvolveTrainer | \n",
- " Trace scalar | \n",
- " real | \n",
- " ok | \n",
- " openevolve.run_evolution | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 3 | \n",
- " 3 | \n",
- " -3.0 | \n",
- " -3.0 | \n",
- " 0.0 | \n",
- " -3.0 | \n",
- " -3.0 | \n",
- " 0.0 | \n",
- " None | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " DSPyTrainer | \n",
- " DSPy scalar | \n",
- " real | \n",
- " ok | \n",
- " dspy.COPRO | \n",
- " Always answer 0. | \n",
- " Provide a consistent response of \"0\" for any a... | \n",
- " 3 | \n",
- " 3 | \n",
- " -3.0 | \n",
- " -3.0 | \n",
- " 0.0 | \n",
- " -3.0 | \n",
- " -3.0 | \n",
- " 0.0 | \n",
- " None | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " trainer_id task mode status \\\n",
- "0 PrioritySearch Trace scalar real ok \n",
- "1 TextGradTrainer Trace scalar real ok \n",
- "2 OpenEvolveTrainer Trace scalar real ok \n",
- "3 DSPyTrainer DSPy scalar real ok \n",
- "\n",
- " resolved_optimizer before_value \\\n",
- "0 OptoPrimeV2 0.0 \n",
- "1 opto.optimizers.textgrad.TextGrad 0.0 \n",
- "2 openevolve.run_evolution 0.0 \n",
- "3 dspy.COPRO Always answer 0. \n",
- "\n",
- " after_value train_examples \\\n",
- "0 3.0 3 \n",
- "1 3.0 3 \n",
- "2 0.0 3 \n",
- "3 Provide a consistent response of \"0\" for any a... 3 \n",
- "\n",
- " test_examples before_train_score after_train_score train_delta \\\n",
- "0 3 -3.0 0.0 3.0 \n",
- "1 3 -3.0 0.0 3.0 \n",
- "2 3 -3.0 -3.0 0.0 \n",
- "3 3 -3.0 -3.0 0.0 \n",
- "\n",
- " before_test_score after_test_score test_delta error \n",
- "0 -3.0 0.0 3.0 None \n",
- "1 -3.0 0.0 3.0 None \n",
- "2 -3.0 -3.0 0.0 None \n",
- "3 -3.0 -3.0 0.0 None "
- ]
- },
- "metadata": {},
- "output_type": "display_data"
+ "data": {
+ "text/html": [
+ "All trainers improved on held-out examples.
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
{
"data": {
@@ -2190,146 +1209,146 @@
" \n",
" | 0 | \n",
" DSPyTrainer | \n",
- " DSPy scalar | \n",
+ " DSPy routing code | \n",
" test | \n",
" after | \n",
" 0 | \n",
- " test-a | \n",
- " 3.0 | \n",
- " 0 | \n",
- " -3.0 | \n",
+ " routing code for scarlet ticket | \n",
+ " A | \n",
+ " A | \n",
+ " 1.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" DSPyTrainer | \n",
- " DSPy scalar | \n",
+ " DSPy routing code | \n",
" test | \n",
" before | \n",
" 0 | \n",
- " test-a | \n",
- " 3.0 | \n",
- " 0 | \n",
- " -3.0 | \n",
+ " routing code for scarlet ticket | \n",
+ " A | \n",
+ " S | \n",
+ " 0.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" DSPyTrainer | \n",
- " DSPy scalar | \n",
+ " DSPy routing code | \n",
" test | \n",
" after | \n",
" 1 | \n",
- " test-b | \n",
- " 3.0 | \n",
- " 0 | \n",
- " -3.0 | \n",
+ " routing code for azure ticket | \n",
+ " B | \n",
+ " B | \n",
+ " 1.0 | \n",
"
\n",
" \n",
" | 3 | \n",
" DSPyTrainer | \n",
- " DSPy scalar | \n",
+ " DSPy routing code | \n",
" test | \n",
" before | \n",
" 1 | \n",
- " test-b | \n",
- " 3.0 | \n",
- " 0 | \n",
- " -3.0 | \n",
+ " routing code for azure ticket | \n",
+ " B | \n",
+ " A | \n",
+ " 0.0 | \n",
"
\n",
" \n",
" | 4 | \n",
" DSPyTrainer | \n",
- " DSPy scalar | \n",
+ " DSPy routing code | \n",
" test | \n",
" after | \n",
" 2 | \n",
- " test-c | \n",
- " 3.0 | \n",
- " 0 | \n",
- " -3.0 | \n",
+ " routing code for emerald ticket | \n",
+ " C | \n",
+ " C | \n",
+ " 1.0 | \n",
"
\n",
" \n",
" | 5 | \n",
" DSPyTrainer | \n",
- " DSPy scalar | \n",
+ " DSPy routing code | \n",
" test | \n",
" before | \n",
" 2 | \n",
- " test-c | \n",
- " 3.0 | \n",
- " 0 | \n",
- " -3.0 | \n",
+ " routing code for emerald ticket | \n",
+ " C | \n",
+ " E | \n",
+ " 0.0 | \n",
"
\n",
" \n",
" | 6 | \n",
" DSPyTrainer | \n",
- " DSPy scalar | \n",
+ " DSPy routing code | \n",
" train | \n",
" after | \n",
" 0 | \n",
- " train-a | \n",
- " 3.0 | \n",
- " 0 | \n",
- " -3.0 | \n",
+ " customer tier scarlet | \n",
+ " A | \n",
+ " A | \n",
+ " 1.0 | \n",
"
\n",
" \n",
" | 7 | \n",
" DSPyTrainer | \n",
- " DSPy scalar | \n",
+ " DSPy routing code | \n",
" train | \n",
" before | \n",
" 0 | \n",
- " train-a | \n",
- " 3.0 | \n",
- " 0 | \n",
- " -3.0 | \n",
+ " customer tier scarlet | \n",
+ " A | \n",
+ " S | \n",
+ " 0.0 | \n",
"
\n",
" \n",
" | 8 | \n",
" DSPyTrainer | \n",
- " DSPy scalar | \n",
+ " DSPy routing code | \n",
" train | \n",
" after | \n",
" 1 | \n",
- " train-b | \n",
- " 3.0 | \n",
- " 0 | \n",
- " -3.0 | \n",
+ " customer tier azure | \n",
+ " B | \n",
+ " B | \n",
+ " 1.0 | \n",
"
\n",
" \n",
" | 9 | \n",
" DSPyTrainer | \n",
- " DSPy scalar | \n",
+ " DSPy routing code | \n",
" train | \n",
" before | \n",
" 1 | \n",
- " train-b | \n",
- " 3.0 | \n",
- " 0 | \n",
- " -3.0 | \n",
+ " customer tier azure | \n",
+ " B | \n",
+ " A | \n",
+ " 0.0 | \n",
"
\n",
" \n",
" | 10 | \n",
" DSPyTrainer | \n",
- " DSPy scalar | \n",
+ " DSPy routing code | \n",
" train | \n",
" after | \n",
" 2 | \n",
- " train-c | \n",
- " 3.0 | \n",
- " 0 | \n",
- " -3.0 | \n",
+ " customer tier emerald | \n",
+ " C | \n",
+ " C | \n",
+ " 1.0 | \n",
"
\n",
" \n",
" | 11 | \n",
" DSPyTrainer | \n",
- " DSPy scalar | \n",
+ " DSPy routing code | \n",
" train | \n",
" before | \n",
" 2 | \n",
- " train-c | \n",
- " 3.0 | \n",
- " 0 | \n",
- " -3.0 | \n",
+ " customer tier emerald | \n",
+ " C | \n",
+ " E | \n",
+ " 0.0 | \n",
"
\n",
" \n",
" | 12 | \n",
@@ -2340,8 +1359,8 @@
" 0 | \n",
" test-a | \n",
" 3.0 | \n",
- " 0.0 | \n",
- " -3.0 | \n",
+ " 1.0 | \n",
+ " -2.0 | \n",
"
\n",
" \n",
" | 13 | \n",
@@ -2412,8 +1431,8 @@
" 1 | \n",
" test-b | \n",
" 3.0 | \n",
- " 0.0 | \n",
- " -3.0 | \n",
+ " 1.0 | \n",
+ " -2.0 | \n",
"
\n",
" \n",
" | 19 | \n",
@@ -2484,8 +1503,8 @@
" 2 | \n",
" test-c | \n",
" 3.0 | \n",
- " 0.0 | \n",
- " -3.0 | \n",
+ " 1.0 | \n",
+ " -2.0 | \n",
"
\n",
" \n",
" | 25 | \n",
@@ -2556,8 +1575,8 @@
" 0 | \n",
" train-a | \n",
" 3.0 | \n",
- " 0.0 | \n",
- " -3.0 | \n",
+ " 1.0 | \n",
+ " -2.0 | \n",
"
\n",
" \n",
" | 31 | \n",
@@ -2628,8 +1647,8 @@
" 1 | \n",
" train-b | \n",
" 3.0 | \n",
- " 0.0 | \n",
- " -3.0 | \n",
+ " 1.0 | \n",
+ " -2.0 | \n",
"
\n",
" \n",
" | 37 | \n",
@@ -2700,8 +1719,8 @@
" 2 | \n",
" train-c | \n",
" 3.0 | \n",
- " 0.0 | \n",
- " -3.0 | \n",
+ " 1.0 | \n",
+ " -2.0 | \n",
"
\n",
" \n",
" | 43 | \n",
@@ -2768,109 +1787,119 @@
""
],
"text/plain": [
- " trainer_id task split phase example input \\\n",
- "0 DSPyTrainer DSPy scalar test after 0 test-a \n",
- "1 DSPyTrainer DSPy scalar test before 0 test-a \n",
- "2 DSPyTrainer DSPy scalar test after 1 test-b \n",
- "3 DSPyTrainer DSPy scalar test before 1 test-b \n",
- "4 DSPyTrainer DSPy scalar test after 2 test-c \n",
- "5 DSPyTrainer DSPy scalar test before 2 test-c \n",
- "6 DSPyTrainer DSPy scalar train after 0 train-a \n",
- "7 DSPyTrainer DSPy scalar train before 0 train-a \n",
- "8 DSPyTrainer DSPy scalar train after 1 train-b \n",
- "9 DSPyTrainer DSPy scalar train before 1 train-b \n",
- "10 DSPyTrainer DSPy scalar train after 2 train-c \n",
- "11 DSPyTrainer DSPy scalar train before 2 train-c \n",
- "12 OpenEvolveTrainer Trace scalar test after 0 test-a \n",
- "13 OpenEvolveTrainer Trace scalar test before 0 test-a \n",
- "14 PrioritySearch Trace scalar test after 0 test-a \n",
- "15 PrioritySearch Trace scalar test before 0 test-a \n",
- "16 TextGradTrainer Trace scalar test after 0 test-a \n",
- "17 TextGradTrainer Trace scalar test before 0 test-a \n",
- "18 OpenEvolveTrainer Trace scalar test after 1 test-b \n",
- "19 OpenEvolveTrainer Trace scalar test before 1 test-b \n",
- "20 PrioritySearch Trace scalar test after 1 test-b \n",
- "21 PrioritySearch Trace scalar test before 1 test-b \n",
- "22 TextGradTrainer Trace scalar test after 1 test-b \n",
- "23 TextGradTrainer Trace scalar test before 1 test-b \n",
- "24 OpenEvolveTrainer Trace scalar test after 2 test-c \n",
- "25 OpenEvolveTrainer Trace scalar test before 2 test-c \n",
- "26 PrioritySearch Trace scalar test after 2 test-c \n",
- "27 PrioritySearch Trace scalar test before 2 test-c \n",
- "28 TextGradTrainer Trace scalar test after 2 test-c \n",
- "29 TextGradTrainer Trace scalar test before 2 test-c \n",
- "30 OpenEvolveTrainer Trace scalar train after 0 train-a \n",
- "31 OpenEvolveTrainer Trace scalar train before 0 train-a \n",
- "32 PrioritySearch Trace scalar train after 0 train-a \n",
- "33 PrioritySearch Trace scalar train before 0 train-a \n",
- "34 TextGradTrainer Trace scalar train after 0 train-a \n",
- "35 TextGradTrainer Trace scalar train before 0 train-a \n",
- "36 OpenEvolveTrainer Trace scalar train after 1 train-b \n",
- "37 OpenEvolveTrainer Trace scalar train before 1 train-b \n",
- "38 PrioritySearch Trace scalar train after 1 train-b \n",
- "39 PrioritySearch Trace scalar train before 1 train-b \n",
- "40 TextGradTrainer Trace scalar train after 1 train-b \n",
- "41 TextGradTrainer Trace scalar train before 1 train-b \n",
- "42 OpenEvolveTrainer Trace scalar train after 2 train-c \n",
- "43 OpenEvolveTrainer Trace scalar train before 2 train-c \n",
- "44 PrioritySearch Trace scalar train after 2 train-c \n",
- "45 PrioritySearch Trace scalar train before 2 train-c \n",
- "46 TextGradTrainer Trace scalar train after 2 train-c \n",
- "47 TextGradTrainer Trace scalar train before 2 train-c \n",
+ " trainer_id task split phase example \\\n",
+ "0 DSPyTrainer DSPy routing code test after 0 \n",
+ "1 DSPyTrainer DSPy routing code test before 0 \n",
+ "2 DSPyTrainer DSPy routing code test after 1 \n",
+ "3 DSPyTrainer DSPy routing code test before 1 \n",
+ "4 DSPyTrainer DSPy routing code test after 2 \n",
+ "5 DSPyTrainer DSPy routing code test before 2 \n",
+ "6 DSPyTrainer DSPy routing code train after 0 \n",
+ "7 DSPyTrainer DSPy routing code train before 0 \n",
+ "8 DSPyTrainer DSPy routing code train after 1 \n",
+ "9 DSPyTrainer DSPy routing code train before 1 \n",
+ "10 DSPyTrainer DSPy routing code train after 2 \n",
+ "11 DSPyTrainer DSPy routing code train before 2 \n",
+ "12 OpenEvolveTrainer Trace scalar test after 0 \n",
+ "13 OpenEvolveTrainer Trace scalar test before 0 \n",
+ "14 PrioritySearch Trace scalar test after 0 \n",
+ "15 PrioritySearch Trace scalar test before 0 \n",
+ "16 TextGradTrainer Trace scalar test after 0 \n",
+ "17 TextGradTrainer Trace scalar test before 0 \n",
+ "18 OpenEvolveTrainer Trace scalar test after 1 \n",
+ "19 OpenEvolveTrainer Trace scalar test before 1 \n",
+ "20 PrioritySearch Trace scalar test after 1 \n",
+ "21 PrioritySearch Trace scalar test before 1 \n",
+ "22 TextGradTrainer Trace scalar test after 1 \n",
+ "23 TextGradTrainer Trace scalar test before 1 \n",
+ "24 OpenEvolveTrainer Trace scalar test after 2 \n",
+ "25 OpenEvolveTrainer Trace scalar test before 2 \n",
+ "26 PrioritySearch Trace scalar test after 2 \n",
+ "27 PrioritySearch Trace scalar test before 2 \n",
+ "28 TextGradTrainer Trace scalar test after 2 \n",
+ "29 TextGradTrainer Trace scalar test before 2 \n",
+ "30 OpenEvolveTrainer Trace scalar train after 0 \n",
+ "31 OpenEvolveTrainer Trace scalar train before 0 \n",
+ "32 PrioritySearch Trace scalar train after 0 \n",
+ "33 PrioritySearch Trace scalar train before 0 \n",
+ "34 TextGradTrainer Trace scalar train after 0 \n",
+ "35 TextGradTrainer Trace scalar train before 0 \n",
+ "36 OpenEvolveTrainer Trace scalar train after 1 \n",
+ "37 OpenEvolveTrainer Trace scalar train before 1 \n",
+ "38 PrioritySearch Trace scalar train after 1 \n",
+ "39 PrioritySearch Trace scalar train before 1 \n",
+ "40 TextGradTrainer Trace scalar train after 1 \n",
+ "41 TextGradTrainer Trace scalar train before 1 \n",
+ "42 OpenEvolveTrainer Trace scalar train after 2 \n",
+ "43 OpenEvolveTrainer Trace scalar train before 2 \n",
+ "44 PrioritySearch Trace scalar train after 2 \n",
+ "45 PrioritySearch Trace scalar train before 2 \n",
+ "46 TextGradTrainer Trace scalar train after 2 \n",
+ "47 TextGradTrainer Trace scalar train before 2 \n",
"\n",
- " expected output score \n",
- "0 3.0 0 -3.0 \n",
- "1 3.0 0 -3.0 \n",
- "2 3.0 0 -3.0 \n",
- "3 3.0 0 -3.0 \n",
- "4 3.0 0 -3.0 \n",
- "5 3.0 0 -3.0 \n",
- "6 3.0 0 -3.0 \n",
- "7 3.0 0 -3.0 \n",
- "8 3.0 0 -3.0 \n",
- "9 3.0 0 -3.0 \n",
- "10 3.0 0 -3.0 \n",
- "11 3.0 0 -3.0 \n",
- "12 3.0 0.0 -3.0 \n",
- "13 3.0 0.0 -3.0 \n",
- "14 3.0 3.0 -0.0 \n",
- "15 3.0 0.0 -3.0 \n",
- "16 3.0 3.0 -0.0 \n",
- "17 3.0 0.0 -3.0 \n",
- "18 3.0 0.0 -3.0 \n",
- "19 3.0 0.0 -3.0 \n",
- "20 3.0 3.0 -0.0 \n",
- "21 3.0 0.0 -3.0 \n",
- "22 3.0 3.0 -0.0 \n",
- "23 3.0 0.0 -3.0 \n",
- "24 3.0 0.0 -3.0 \n",
- "25 3.0 0.0 -3.0 \n",
- "26 3.0 3.0 -0.0 \n",
- "27 3.0 0.0 -3.0 \n",
- "28 3.0 3.0 -0.0 \n",
- "29 3.0 0.0 -3.0 \n",
- "30 3.0 0.0 -3.0 \n",
- "31 3.0 0.0 -3.0 \n",
- "32 3.0 3.0 -0.0 \n",
- "33 3.0 0.0 -3.0 \n",
- "34 3.0 3.0 -0.0 \n",
- "35 3.0 0.0 -3.0 \n",
- "36 3.0 0.0 -3.0 \n",
- "37 3.0 0.0 -3.0 \n",
- "38 3.0 3.0 -0.0 \n",
- "39 3.0 0.0 -3.0 \n",
- "40 3.0 3.0 -0.0 \n",
- "41 3.0 0.0 -3.0 \n",
- "42 3.0 0.0 -3.0 \n",
- "43 3.0 0.0 -3.0 \n",
- "44 3.0 3.0 -0.0 \n",
- "45 3.0 0.0 -3.0 \n",
- "46 3.0 3.0 -0.0 \n",
- "47 3.0 0.0 -3.0 "
+ " input expected output score \n",
+ "0 routing code for scarlet ticket A A 1.0 \n",
+ "1 routing code for scarlet ticket A S 0.0 \n",
+ "2 routing code for azure ticket B B 1.0 \n",
+ "3 routing code for azure ticket B A 0.0 \n",
+ "4 routing code for emerald ticket C C 1.0 \n",
+ "5 routing code for emerald ticket C E 0.0 \n",
+ "6 customer tier scarlet A A 1.0 \n",
+ "7 customer tier scarlet A S 0.0 \n",
+ "8 customer tier azure B B 1.0 \n",
+ "9 customer tier azure B A 0.0 \n",
+ "10 customer tier emerald C C 1.0 \n",
+ "11 customer tier emerald C E 0.0 \n",
+ "12 test-a 3.0 1.0 -2.0 \n",
+ "13 test-a 3.0 0.0 -3.0 \n",
+ "14 test-a 3.0 3.0 -0.0 \n",
+ "15 test-a 3.0 0.0 -3.0 \n",
+ "16 test-a 3.0 3.0 -0.0 \n",
+ "17 test-a 3.0 0.0 -3.0 \n",
+ "18 test-b 3.0 1.0 -2.0 \n",
+ "19 test-b 3.0 0.0 -3.0 \n",
+ "20 test-b 3.0 3.0 -0.0 \n",
+ "21 test-b 3.0 0.0 -3.0 \n",
+ "22 test-b 3.0 3.0 -0.0 \n",
+ "23 test-b 3.0 0.0 -3.0 \n",
+ "24 test-c 3.0 1.0 -2.0 \n",
+ "25 test-c 3.0 0.0 -3.0 \n",
+ "26 test-c 3.0 3.0 -0.0 \n",
+ "27 test-c 3.0 0.0 -3.0 \n",
+ "28 test-c 3.0 3.0 -0.0 \n",
+ "29 test-c 3.0 0.0 -3.0 \n",
+ "30 train-a 3.0 1.0 -2.0 \n",
+ "31 train-a 3.0 0.0 -3.0 \n",
+ "32 train-a 3.0 3.0 -0.0 \n",
+ "33 train-a 3.0 0.0 -3.0 \n",
+ "34 train-a 3.0 3.0 -0.0 \n",
+ "35 train-a 3.0 0.0 -3.0 \n",
+ "36 train-b 3.0 1.0 -2.0 \n",
+ "37 train-b 3.0 0.0 -3.0 \n",
+ "38 train-b 3.0 3.0 -0.0 \n",
+ "39 train-b 3.0 0.0 -3.0 \n",
+ "40 train-b 3.0 3.0 -0.0 \n",
+ "41 train-b 3.0 0.0 -3.0 \n",
+ "42 train-c 3.0 1.0 -2.0 \n",
+ "43 train-c 3.0 0.0 -3.0 \n",
+ "44 train-c 3.0 3.0 -0.0 \n",
+ "45 train-c 3.0 0.0 -3.0 \n",
+ "46 train-c 3.0 3.0 -0.0 \n",
+ "47 train-c 3.0 0.0 -3.0 "
]
},
"metadata": {},
"output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "PrioritySearch: -3.0 -> 0.0 (test_delta=3.0, improvement=YES)\n",
+ "TextGradTrainer: -3.0 -> 0.0 (test_delta=3.0, improvement=YES)\n",
+ "OpenEvolveTrainer: -3.0 -> -2.0 (test_delta=1.0, improvement=YES)\n",
+ "DSPyTrainer: 0.0 -> 1.0 (test_delta=1.0, improvement=YES)\n"
+ ]
}
],
"source": [
@@ -2886,22 +1915,26 @@
" \"resolved_optimizer\": None,\n",
" \"before_value\": None,\n",
" \"after_value\": None,\n",
- " \"train_examples\": len(SMOKE_TRAIN_DATASET[\"inputs\"]),\n",
- " \"test_examples\": len(SMOKE_TEST_DATASET[\"inputs\"]),\n",
+ " \"train_examples\": None,\n",
+ " \"test_examples\": None,\n",
" \"before_train_score\": None,\n",
" \"after_train_score\": None,\n",
" \"train_delta\": None,\n",
" \"before_test_score\": None,\n",
" \"after_test_score\": None,\n",
" \"test_delta\": None,\n",
+ " \"improvement\": \"NO\",\n",
" \"error\": item[\"error\"],\n",
" })\n",
" continue\n",
+ "\n",
" result_status = item[\"result\"].get(\"status\")\n",
" before_train = item[\"before\"][\"train\"][\"mean_score\"]\n",
" after_train = item[\"after\"][\"train\"][\"mean_score\"]\n",
" before_test = item[\"before\"][\"test\"][\"mean_score\"]\n",
" after_test = item[\"after\"][\"test\"][\"mean_score\"]\n",
+ " test_delta = after_test - before_test\n",
+ " improved = result_status == \"ok\" and test_delta > 0\n",
" summary_rows.append({\n",
" \"trainer_id\": item[\"trainer_id\"],\n",
" \"task\": item[\"task\"],\n",
@@ -2910,14 +1943,15 @@
" \"resolved_optimizer\": item[\"result\"].get(\"resolved_optimizer\"),\n",
" \"before_value\": item[\"before\"][\"value\"],\n",
" \"after_value\": item[\"after\"][\"value\"],\n",
- " \"train_examples\": len(SMOKE_TRAIN_DATASET[\"inputs\"]),\n",
- " \"test_examples\": len(SMOKE_TEST_DATASET[\"inputs\"]),\n",
+ " \"train_examples\": item[\"train_examples\"],\n",
+ " \"test_examples\": item[\"test_examples\"],\n",
" \"before_train_score\": before_train,\n",
" \"after_train_score\": after_train,\n",
" \"train_delta\": after_train - before_train,\n",
" \"before_test_score\": before_test,\n",
" \"after_test_score\": after_test,\n",
- " \"test_delta\": after_test - before_test,\n",
+ " \"test_delta\": test_delta,\n",
+ " \"improvement\": \"YES\" if improved else \"NO\",\n",
" \"error\": item[\"result\"].get(\"error\"),\n",
" })\n",
" for split_name in (\"train\", \"test\"):\n",
@@ -2938,11 +1972,35 @@
"trainer_comparison = pd.DataFrame(summary_rows)\n",
"example_comparison = pd.DataFrame(example_rows)\n",
"\n",
- "display(trainer_comparison)\n",
+ "score_columns = [\"before_train_score\", \"after_train_score\", \"train_delta\", \"before_test_score\", \"after_test_score\", \"test_delta\"]\n",
+ "def mark_no_improvement(row: pd.Series) -> list[str]:\n",
+ " style = \"background-color: #ffd6d6; color: #9f0000; font-weight: 700\"\n",
+ " return [style if row.get(\"improvement\") != \"YES\" else \"\" for _ in row]\n",
+ "\n",
+ "styled_comparison = (\n",
+ " trainer_comparison.style\n",
+ " .apply(mark_no_improvement, axis=1)\n",
+ " .format({column: \"{:.3f}\" for column in score_columns}, na_rep=\"n/a\")  # error rows carry None scores; without na_rep an all-error run fails to render\n",
+ ")\n",
+ "display(styled_comparison)\n",
+ "\n",
+ "no_improvement = trainer_comparison[trainer_comparison[\"improvement\"] != \"YES\"]\n",
+ "if no_improvement.empty:\n",
+ " display(HTML(\"<p>All trainers improved on held-out examples.</p>\"))\n",
+ "else:\n",
+ " names = \", \".join(no_improvement[\"trainer_id\"].astype(str).tolist())\n",
+ " display(HTML(f\"<p>NO HELD-OUT IMPROVEMENT: {names}</p>\"))\n",
+ "\n",
"if example_rows:\n",
" display(example_comparison.sort_values([\"task\", \"split\", \"example\", \"trainer_id\", \"phase\"]).reset_index(drop=True))\n",
"else:\n",
- " print(\"No per-example outputs were produced because all real trainer runs errored.\")"
+ " print(\"No per-example outputs were produced because all real trainer runs errored.\")\n",
+ "\n",
+ "for _, row in trainer_comparison.iterrows():\n",
+ " print(\n",
+ " f\"{row['trainer_id']}: {row['before_test_score']} -> {row['after_test_score']} \"\n",
+ " f\"(test_delta={row['test_delta']}, improvement={row['improvement']})\"\n",
+ " )"
]
},
{
@@ -2952,19 +2010,7 @@
"source": [
"## 9. Practical reading guide\n",
"\n",
- "When you inspect the results, read them in this order:\n",
- "\n",
- "1. **Focused tests** \n",
- " If these fail, the branch is not ready to trust.\n",
- "\n",
- "2. **Discovery table** \n",
- " If `TextGradTrainer`, `OpenEvolveTrainer`, or `DSPyTrainer` are missing, the branch or optional packages are not properly present or installed.\n",
- "\n",
- "3. **Real train/test smoke tables** \n",
- " This confirms each trainer uses the real installed package path on three train examples and three held-out examples.\n",
- "\n",
- "4. **Error rows** \n",
- " An error row means the real trainer path failed and should be inspected before trusting comparison scores."
+ "Read red rows first. A red row means the real trainer path either failed or did not improve the held-out score. Then compare `before_value` and `after_value` in the summary table to see whether the trainer changed the parameter/instruction, and inspect the per-example table to see whether that change generalized beyond the three training examples."
]
},
{
@@ -2978,18 +2024,13 @@
"- focused tests pass\n",
"- discovery shows the comparison trainers\n",
"- real `opto.optimizers.textgrad.TextGrad`, `openevolve.run_evolution`, and `dspy.LM` import successfully\n",
- "- real smoke rows for Trace, TextGrad, OpenEvolve, and DSPy complete without errors\n",
+ "- all real trainer rows complete\n",
+ "- no rows are highlighted red\n",
"\n",
- "### Partial success\n",
- "- focused tests pass\n",
- "- structural checks pass\n",
- "- one trainer reports an error row while the others complete, making the failure comparable\n",
- "\n",
- "### Failure\n",
- "- trainers are not discovered\n",
- "- focused tests fail\n",
- "- DSPy is not backed by a real `dspy.LM`\n",
- "- OpenEvolve path requires `exec` or unsafe parsing"
+ "### Needs follow-up\n",
+ "- a trainer reports an error row\n",
+ "- a trainer completes but is highlighted red because held-out score did not improve\n",
+ "- per-example outputs show memorization or no meaningful parameter/instruction change"
]
}
],
diff --git a/tests/test_textgrad_trainer.py b/tests/test_textgrad_trainer.py
index 221f84c..f1ccc41 100644
--- a/tests/test_textgrad_trainer.py
+++ b/tests/test_textgrad_trainer.py
@@ -31,12 +31,14 @@ def __call__(self, task_input: str, response: str, task_info: str):
return (1.0 if response == task_info else 0.0), f"expected {task_info}"
-def _import_textgrad_trainer(monkeypatch, proposal: str):
+def _import_textgrad_trainer(monkeypatch: pytest.MonkeyPatch, proposal: str, capture: dict[str, object] | None = None) -> types.ModuleType:
fake_module = types.ModuleType("opto.optimizers.textgrad")
class _FakeTextGrad:
def __init__(self, parameters, **_kwargs) -> None:
self.parameters = list(parameters)
+ if capture is not None:
+ capture["init_kwargs"] = _kwargs
def zero_feedback(self) -> None:
return None
@@ -99,3 +101,21 @@ def parameters(self):
train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]},
mode="real",
)
+
+
+def test_textgrad_trainer_forwards_llm(monkeypatch: pytest.MonkeyPatch) -> None:
+ """TextGradTrainer forwards explicit LLM objects to NewTrace TextGrad."""
+ capture: dict[str, object] = {}
+ trainer_module = _import_textgrad_trainer(monkeypatch, proposal="Hello", capture=capture)
+ trainer = trainer_module.TextGradTrainer(_DummyAgent("Hi"))
+ llm = object()
+ trainer.train(
+ guide=_DummyGuide(),
+ train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]},
+ mode="real",
+ ensure_improvement=False,
+ llm=llm,
+ )
+ init_kwargs = capture["init_kwargs"]
+ assert isinstance(init_kwargs, dict)
+ assert init_kwargs["llm"] is llm
diff --git a/trace_bench/trainers/textgrad_trainer.py b/trace_bench/trainers/textgrad_trainer.py
index a8eb5e0..4bad679 100644
--- a/trace_bench/trainers/textgrad_trainer.py
+++ b/trace_bench/trainers/textgrad_trainer.py
@@ -51,7 +51,7 @@ def _standard_optimization_step(self, guide: Any, task_input: Any, task_info: An
target = exc.exception_node
return target, float(min_score), target.create_feedback("full")
- def train(self, guide: Any, train_dataset: Dict[str, Any], *, mode: str = "real", num_epochs: int = 1, batch_size: int = 1, min_score: float = 0.0, validate_dataset: Optional[Dict[str, Any]] = None, ensure_improvement: bool = True, improvement_threshold: float = 0.0, max_tokens: int = 4096, verbose: Union[bool, str] = False, **_kwargs: Any) -> Dict[str, Any]:
+ def train(self, guide: Any, train_dataset: Dict[str, Any], *, mode: str = "real", num_epochs: int = 1, batch_size: int = 1, min_score: float = 0.0, validate_dataset: Optional[Dict[str, Any]] = None, ensure_improvement: bool = True, improvement_threshold: float = 0.0, max_tokens: int = 4096, llm: Any = None, verbose: Union[bool, str] = False, **_kwargs: Any) -> Dict[str, Any]:
"""Optimize Trace parameters with the TextGrad optimizer provided by NewTrace."""
if mode not in {"real", "stub"}:
raise ValueError("mode must be either 'real' or 'stub'.")
@@ -70,7 +70,7 @@ def train(self, guide: Any, train_dataset: Dict[str, Any], *, mode: str = "real"
if not inputs:
raise ValueError("train_dataset must contain at least one example.")
- optimizer = _TraceTextGrad(parameters=parameters, max_tokens=max_tokens)
+ optimizer = _TraceTextGrad(parameters=parameters, max_tokens=max_tokens, llm=llm)
for _ in range(num_epochs):
for start in range(0, len(inputs), batch_size):
batch_inputs = inputs[start : start + batch_size]
From 5406dd658f5500550a95e79661118336f3eda137 Mon Sep 17 00:00:00 2001
From: doxav <>
Date: Wed, 3 Jun 2026 00:34:49 +0200
Subject: [PATCH 4/8] prevent modifying dspy_trainer.py
---
trace_bench/trainers/dspy_trainer.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/trace_bench/trainers/dspy_trainer.py b/trace_bench/trainers/dspy_trainer.py
index 9c1f702..665cb02 100644
--- a/trace_bench/trainers/dspy_trainer.py
+++ b/trace_bench/trainers/dspy_trainer.py
@@ -505,7 +505,7 @@ def train(
verbose=verbose,
)
finally:
- if resolved_lm is not None:
+ if resolved_lm is not None and prev_lm is not None:
_dspy.configure(lm=prev_lm)
def _train_inner(
From 638bc58f785139ac2a1b51de403bcd8110ea6539 Mon Sep 17 00:00:00 2001
From: doxav <>
Date: Thu, 11 Jun 2026 08:33:31 +0200
Subject: [PATCH 5/8] adding missing file :-) : trace_bench/llm.py
---
trace_bench/llm.py | 10 ++++++++++
1 file changed, 10 insertions(+)
create mode 100644 trace_bench/llm.py
diff --git a/trace_bench/llm.py b/trace_bench/llm.py
new file mode 100644
index 0000000..b3f6034
--- /dev/null
+++ b/trace_bench/llm.py
@@ -0,0 +1,10 @@
+from __future__ import annotations
+
+
+def openai_compatible_model_name(model: str) -> str:
+    """Return *model* without a leading ``openrouter/`` prefix, as OpenAI-compatible clients expect."""
+ if not isinstance(model, str):
+ raise TypeError("model must be a string.")
+ if model.startswith("openrouter/"):
+ return model.split("/", 1)[1]
+ return model
From 4b27bd603c4d35c07f0ba4b5700eb7c976fcd7e9 Mon Sep 17 00:00:00 2001
From: doxav <>
Date: Thu, 11 Jun 2026 10:53:02 +0200
Subject: [PATCH 6/8] chore: retrigger CI against latest main
From 705a83e0c41d4a74baf04c4f0a280a2bf3521bb3 Mon Sep 17 00:00:00 2001
From: doxav <>
Date: Thu, 11 Jun 2026 11:53:37 +0200
Subject: [PATCH 7/8] ci: install HF extra and normalize batch_size alias
---
.github/workflows/ci.yml | 2 +-
trace_bench/cli.py | 1 +
trace_bench/resolve.py | 1 +
3 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index fb6758b..0d65e95 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -40,7 +40,7 @@ jobs:
run: |
python -m pip install --upgrade pip
python -m pip install "git+https://github.com/AgentOpt/OpenTrace.git@experimental"
- python -m pip install -e .
+ python -m pip install -e ".[hf]"
- name: Validate installation and syntax
run: |
diff --git a/trace_bench/cli.py b/trace_bench/cli.py
index 745aca2..8813326 100644
--- a/trace_bench/cli.py
+++ b/trace_bench/cli.py
@@ -68,6 +68,7 @@ def _task_in_bench(task_key: str, bench: str | None) -> bool:
"num_iters",
"num_search_iterations",
"train_batch_size",
+ "batch_size",
"merge_every",
"pareto_subset_size",
"ps_steps",
diff --git a/trace_bench/resolve.py b/trace_bench/resolve.py
index c173ed5..475b7e0 100644
--- a/trace_bench/resolve.py
+++ b/trace_bench/resolve.py
@@ -41,6 +41,7 @@ def _param_alias_map(algo_name: str) -> Dict[str, str]:
"ps_candidates": "num_candidates",
"ps_proposals": "num_proposals",
"ps_mem_update": "memory_update_frequency",
+ "batch_size": "train_batch_size",
}
)
if algo_name in _GEPA_TRAINERS:
From 93ec869c9b66985c6659f26ad0d45a48ea965f05 Mon Sep 17 00:00:00 2001
From: doxav <>
Date: Thu, 11 Jun 2026 14:02:19 +0200
Subject: [PATCH 8/8] fix: declare nest-asyncio dependency for OpenEvolve
trainer
---
setup.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/setup.py b/setup.py
index a808cda..769805e 100644
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,7 @@
"tensorboardX",
"tensorboard",
"pyyaml",
+ "nest-asyncio>=1.6.0",
]
# Optional dependencies for external trainers in trace_bench/trainers/.