From 86834194bb8a08947937204c57898efa03783c23 Mon Sep 17 00:00:00 2001 From: doxav <> Date: Wed, 27 May 2026 08:22:13 +0200 Subject: [PATCH 1/8] intermediate checkpoint --- README.md | 6 + tests/test_cli_external_trainer_validation.py | 37 +++++ tests/test_external_trainer_discovery.py | 34 +++++ tests/test_external_utils.py | 18 +++ tests/test_openevolve_trainer.py | 114 +++++++++++++++ tests/test_resolve_external_trainers.py | 16 +++ tests/test_runner_external_mode.py | 76 ++++++++++ tests/test_textgrad_trainer.py | 101 +++++++++++++ trace_bench/cli.py | 25 +++- trace_bench/resolve.py | 55 ++++--- trace_bench/runner.py | 13 +- .../trainers/README_openevolve_trainer.md | 8 ++ .../trainers/README_textgrad_trainer.md | 8 ++ trace_bench/trainers/_external_utils.py | 132 +++++++++++++++++ trace_bench/trainers/openevolve_trainer.py | 136 ++++++++++++++++++ trace_bench/trainers/textgrad_trainer.py | 100 +++++++++++++ 16 files changed, 854 insertions(+), 25 deletions(-) create mode 100644 tests/test_cli_external_trainer_validation.py create mode 100644 tests/test_external_trainer_discovery.py create mode 100644 tests/test_external_utils.py create mode 100644 tests/test_openevolve_trainer.py create mode 100644 tests/test_resolve_external_trainers.py create mode 100644 tests/test_runner_external_mode.py create mode 100644 tests/test_textgrad_trainer.py create mode 100644 trace_bench/trainers/README_openevolve_trainer.md create mode 100644 trace_bench/trainers/README_textgrad_trainer.md create mode 100644 trace_bench/trainers/_external_utils.py create mode 100644 trace_bench/trainers/openevolve_trainer.py create mode 100644 trace_bench/trainers/textgrad_trainer.py diff --git a/README.md b/README.md index 40e3aae..3cf59bc 100644 --- a/README.md +++ b/README.md @@ -128,3 +128,9 @@ PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 pytest -q ## License MIT + +## External Trainers + +- `DSPyTrainer` (`trace_bench/trainers/dspy_trainer.py`) +- `TextGradTrainer` 
(`trace_bench/trainers/textgrad_trainer.py`) +- `OpenEvolveTrainer` (`trace_bench/trainers/openevolve_trainer.py`) diff --git a/tests/test_cli_external_trainer_validation.py b/tests/test_cli_external_trainer_validation.py new file mode 100644 index 0000000..b86f303 --- /dev/null +++ b/tests/test_cli_external_trainer_validation.py @@ -0,0 +1,37 @@ +from trace_bench.cli import _validate_trainer_params +from trace_bench.config import TrainerConfig + + +class _FakeExternalTrainer: + USES_TRACE_OPTIMIZER = False + + def train( + self, + guide, + train_dataset, + *, + iterations: int = 1, + ensure_improvement: bool = True, + verbose: bool = False, + **_kwargs, + ): + return {} + + +def test_validate_trainer_params_uses_train_signature(monkeypatch) -> None: + monkeypatch.setattr("trace_bench.cli._resolve_algorithm", lambda _trainer_id: _FakeExternalTrainer) + trainer = TrainerConfig( + id="OpenEvolveTrainer", + params_variants=[{"iterations": 2, "ensure_improvement": False}], + ) + errors = [] + _validate_trainer_params(trainer, errors) + assert errors == [] + + +def test_validate_trainer_params_rejects_unknown_kwarg(monkeypatch) -> None: + monkeypatch.setattr("trace_bench.cli._resolve_algorithm", lambda _trainer_id: _FakeExternalTrainer) + trainer = TrainerConfig(id="OpenEvolveTrainer", params_variants=[{"unknown": 1}]) + errors = [] + _validate_trainer_params(trainer, errors) + assert errors == ["unknown trainer kwarg 'unknown' for OpenEvolveTrainer"] diff --git a/tests/test_external_trainer_discovery.py b/tests/test_external_trainer_discovery.py new file mode 100644 index 0000000..78bc5e9 --- /dev/null +++ b/tests/test_external_trainer_discovery.py @@ -0,0 +1,34 @@ +import importlib +import sys +import types + +from trace_bench.registry import discover_trainers + + +def _install_fake_external_dependencies(monkeypatch) -> None: + fake_textgrad_module = types.ModuleType("opto.optimizers.textgrad") + + class _FakeTextGrad: + def __init__(self, parameters, **_kwargs) -> 
None: + self.parameters = list(parameters) + + fake_textgrad_module.TextGrad = _FakeTextGrad + monkeypatch.setitem(sys.modules, "opto.optimizers.textgrad", fake_textgrad_module) + + fake_openevolve_module = types.ModuleType("openevolve") + fake_openevolve_module.run_evolution = lambda **_kwargs: {"best_code": 'candidate = {}'} + monkeypatch.setitem(sys.modules, "openevolve", fake_openevolve_module) + + +def test_discover_trainers_lists_new_external_trainers_when_dependencies_are_available(monkeypatch) -> None: + _install_fake_external_dependencies(monkeypatch) + + import trace_bench.trainers.textgrad_trainer as textgrad_trainer + import trace_bench.trainers.openevolve_trainer as openevolve_trainer + + importlib.reload(textgrad_trainer) + importlib.reload(openevolve_trainer) + + specs = {spec.id: spec for spec in discover_trainers()} + assert specs["TextGradTrainer"].available is True + assert specs["OpenEvolveTrainer"].available is True diff --git a/tests/test_external_utils.py b/tests/test_external_utils.py new file mode 100644 index 0000000..d72a41e --- /dev/null +++ b/tests/test_external_utils.py @@ -0,0 +1,18 @@ +from trace_bench.trainers._external_utils import apply_parameter_updates + + +class _ReadOnlyDataParam: + def __init__(self, value: str) -> None: + self._data = value + + @property + def data(self) -> str: + return self._data + + +def test_apply_parameter_updates_falls_back_to_private_data_slot_when_data_property_has_no_setter() -> None: + parameter = _ReadOnlyDataParam("before") + + apply_parameter_updates({parameter: "after"}) + + assert parameter.data == "after" diff --git a/tests/test_openevolve_trainer.py b/tests/test_openevolve_trainer.py new file mode 100644 index 0000000..1c4bfbd --- /dev/null +++ b/tests/test_openevolve_trainer.py @@ -0,0 +1,114 @@ +import importlib +import sys +import types + +import pytest + + +class _DummyParam: + def __init__(self, name: str, value: str) -> None: + self.name = name + self.py_name = name + self.data = value 
+ self.trainable = True + + +class _DummyAgent: + def __init__(self, greeting: str = "Hi") -> None: + self.greeting = _DummyParam("greeting", greeting) + + def parameters(self): + return [self.greeting] + + def __call__(self, query: str) -> str: + name = query.split()[-1].strip("!.?") + return f"{self.greeting.data}, {name}!" + + +class _DummyGuide: + def __call__(self, task_input: str, response: str, task_info: str): + del task_input + return (1.0 if response == task_info else 0.0), f"expected {task_info}" + + +def _import_openevolve_trainer(monkeypatch, best_code: str): + fake_module = types.ModuleType("openevolve") + + def _run_evolution(*, initial_program, evaluator, iterations, **_kwargs): + del initial_program, evaluator, iterations + return types.SimpleNamespace(best_code=best_code) + + fake_module.run_evolution = _run_evolution + monkeypatch.setitem(sys.modules, "openevolve", fake_module) + sys.modules.pop("trace_bench.trainers.openevolve_trainer", None) + return importlib.import_module("trace_bench.trainers.openevolve_trainer") + + +def test_openevolve_trainer_updates_parameter(monkeypatch) -> None: + trainer_module = _import_openevolve_trainer( + monkeypatch, + best_code='candidate = {"greeting": "Hello"}\n', + ) + trainer = trainer_module.OpenEvolveTrainer(_DummyAgent("Hi")) + result = trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + iterations=1, + ensure_improvement=False, + ) + assert result["status"] == "ok" + assert result["resolved_optimizer"] == "openevolve.run_evolution" + assert trainer.param.greeting.data == "Hello" + + +def test_openevolve_trainer_rejects_worse_candidate(monkeypatch) -> None: + trainer_module = _import_openevolve_trainer( + monkeypatch, + best_code='candidate = {"greeting": "Bad"}\n', + ) + trainer = trainer_module.OpenEvolveTrainer(_DummyAgent("Hello")) + trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, 
Sam!"]}, + mode="real", + iterations=1, + ensure_improvement=True, + ) + assert trainer.param.greeting.data == "Hello" + + +def test_openevolve_trainer_rejects_invalid_candidate_program(monkeypatch) -> None: + trainer_module = _import_openevolve_trainer( + monkeypatch, + best_code='print("bad candidate")\n', + ) + trainer = trainer_module.OpenEvolveTrainer(_DummyAgent("Hello")) + with pytest.raises(ValueError, match="Candidate program"): + trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + iterations=1, + ensure_improvement=False, + ) + + +def test_openevolve_trainer_requires_trainable_parameters(monkeypatch) -> None: + trainer_module = _import_openevolve_trainer( + monkeypatch, + best_code='candidate = {"greeting": "Hello"}\n', + ) + + class _NoTrainables: + def parameters(self): + return [] + + trainer = trainer_module.OpenEvolveTrainer(_NoTrainables()) + with pytest.raises(ValueError, match="no trainable parameters"): + trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + iterations=1, + ) diff --git a/tests/test_resolve_external_trainers.py b/tests/test_resolve_external_trainers.py new file mode 100644 index 0000000..9a7c7d6 --- /dev/null +++ b/tests/test_resolve_external_trainers.py @@ -0,0 +1,16 @@ +from trace_bench.resolve import resolve_trainer_kwargs + + +def test_resolve_trainer_kwargs_does_not_inject_gepa_defaults_for_external_trainers() -> None: + assert resolve_trainer_kwargs({}, "TextGradTrainer") == {} + assert resolve_trainer_kwargs({"iterations": 3}, "OpenEvolveTrainer") == { + "iterations": 3 + } + + +def test_resolve_trainer_kwargs_preserves_gepa_defaults() -> None: + resolved = resolve_trainer_kwargs({}, "GEPA-UCB") + assert resolved["num_search_iterations"] == 1 + assert resolved["train_batch_size"] == 2 + assert resolved["merge_every"] == 2 + assert resolved["pareto_subset_size"] == 2 diff --git 
a/tests/test_runner_external_mode.py b/tests/test_runner_external_mode.py new file mode 100644 index 0000000..3b5b4a8 --- /dev/null +++ b/tests/test_runner_external_mode.py @@ -0,0 +1,76 @@ +from trace_bench.config import TrainerConfig +from trace_bench.runner import _train_bundle + + +class _DummyAgent: + def parameters(self): + return [] + + def __call__(self, query): + return query + + +class _DummyGuide: + def __call__(self, task_input, response, task_info): + return 1.0, "ok" + + +class _FakeExternalTrainer: + USES_TRACE_OPTIMIZER = False + + def __init__(self, agent, logger=None): + del logger + self.param = agent + + def train(self, guide, train_dataset, mode="real", **_kwargs): + del guide, train_dataset + return {"status": "ok", "resolved_optimizer": mode} + + +def test_runner_passes_mode_to_external_trainers(monkeypatch) -> None: + monkeypatch.setattr("trace_bench.runner._resolve_algorithm", lambda _name: _FakeExternalTrainer) + bundle = { + "param": _DummyAgent(), + "guide": _DummyGuide(), + "train_dataset": {"inputs": ["x"], "infos": ["y"]}, + "optimizer_kwargs": {}, + "metadata": {}, + } + trainer = TrainerConfig(id="FakeExternalTrainer", logger="none") + result = _train_bundle(bundle=bundle, trainer_spec=trainer, params={}, mode="stub") + assert result["status"] == "ok" + assert result["resolved_optimizer"] == "stub" + + +class _FakeNonDSPyExternalTrainer(_FakeExternalTrainer): + FRAMEWORK = "trace" + + def train(self, guide, train_dataset, mode="real", **kwargs): + del guide, train_dataset, mode + return {"status": "ok", "resolved_optimizer": kwargs.get("dspy_lm", "absent")} + + +class _FakeDSPyExternalTrainer(_FakeExternalTrainer): + FRAMEWORK = "dspy" + + def train(self, guide, train_dataset, mode="real", **kwargs): + del guide, train_dataset, mode + return {"status": "ok", "resolved_optimizer": kwargs.get("dspy_lm", "absent")} + + +def test_runner_does_not_inject_dspy_stub_into_non_dspy_external_trainers(monkeypatch) -> None: + 
monkeypatch.setattr("trace_bench.runner._resolve_algorithm", lambda _name: _FakeNonDSPyExternalTrainer) + bundle = {"param": _DummyAgent(), "guide": _DummyGuide(), "train_dataset": {"inputs": ["x"], "infos": ["y"]}, "optimizer_kwargs": {}, "metadata": {}} + trainer = TrainerConfig(id="FakeNonDSPyExternalTrainer", logger="none") + result = _train_bundle(bundle=bundle, trainer_spec=trainer, params={}, mode="stub") + assert result["status"] == "ok" + assert result["resolved_optimizer"] == "absent" + + +def test_runner_injects_dspy_stub_only_for_dspy_external_trainers(monkeypatch) -> None: + monkeypatch.setattr("trace_bench.runner._resolve_algorithm", lambda _name: _FakeDSPyExternalTrainer) + bundle = {"param": _DummyAgent(), "guide": _DummyGuide(), "train_dataset": {"inputs": ["x"], "infos": ["y"]}, "optimizer_kwargs": {}, "metadata": {}} + trainer = TrainerConfig(id="FakeDSPyExternalTrainer", logger="none") + result = _train_bundle(bundle=bundle, trainer_spec=trainer, params={}, mode="stub") + assert result["status"] == "ok" + assert result["resolved_optimizer"] == "stub" diff --git a/tests/test_textgrad_trainer.py b/tests/test_textgrad_trainer.py new file mode 100644 index 0000000..221f84c --- /dev/null +++ b/tests/test_textgrad_trainer.py @@ -0,0 +1,101 @@ +import importlib +import sys +import types + +import pytest + + +class _DummyParam: + def __init__(self, name: str, value: str) -> None: + self.name = name + self.py_name = name + self.data = value + self.trainable = True + + +class _DummyAgent: + def __init__(self, greeting: str = "Hi") -> None: + self.greeting = _DummyParam("greeting", greeting) + + def parameters(self): + return [self.greeting] + + def __call__(self, query: str) -> str: + name = query.split()[-1].strip("!.?") + return f"{self.greeting.data}, {name}!" 
+ + +class _DummyGuide: + def __call__(self, task_input: str, response: str, task_info: str): + del task_input + return (1.0 if response == task_info else 0.0), f"expected {task_info}" + + +def _import_textgrad_trainer(monkeypatch, proposal: str): + fake_module = types.ModuleType("opto.optimizers.textgrad") + + class _FakeTextGrad: + def __init__(self, parameters, **_kwargs) -> None: + self.parameters = list(parameters) + + def zero_feedback(self) -> None: + return None + + def backward(self, target, feedback) -> None: + del target, feedback + return None + + def step(self, bypassing=False, verbose=False): + del bypassing, verbose + return {self.parameters[0]: proposal} + + fake_module.TextGrad = _FakeTextGrad + monkeypatch.setitem(sys.modules, "opto.optimizers.textgrad", fake_module) + sys.modules.pop("trace_bench.trainers.textgrad_trainer", None) + return importlib.import_module("trace_bench.trainers.textgrad_trainer") + + +def test_textgrad_trainer_updates_parameter(monkeypatch) -> None: + trainer_module = _import_textgrad_trainer(monkeypatch, proposal="Hello") + trainer = trainer_module.TextGradTrainer(_DummyAgent("Hi")) + result = trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + num_epochs=1, + batch_size=1, + ensure_improvement=False, + ) + assert result["status"] == "ok" + assert result["resolved_optimizer"] == "opto.optimizers.textgrad.TextGrad" + assert trainer.param.greeting.data == "Hello" + + +def test_textgrad_trainer_rejects_worse_candidate(monkeypatch) -> None: + trainer_module = _import_textgrad_trainer(monkeypatch, proposal="Bad") + trainer = trainer_module.TextGradTrainer(_DummyAgent("Hello")) + trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + num_epochs=1, + batch_size=1, + ensure_improvement=True, + ) + assert trainer.param.greeting.data == "Hello" + + +def 
test_textgrad_trainer_requires_trainable_parameters(monkeypatch) -> None: + trainer_module = _import_textgrad_trainer(monkeypatch, proposal="Hello") + + class _NoTrainables: + def parameters(self): + return [] + + trainer = trainer_module.TextGradTrainer(_NoTrainables()) + with pytest.raises(ValueError, match="no trainable parameters"): + trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + ) diff --git a/trace_bench/cli.py b/trace_bench/cli.py index 853b9b8..745aca2 100644 --- a/trace_bench/cli.py +++ b/trace_bench/cli.py @@ -1,6 +1,7 @@ from __future__ import annotations import argparse +import inspect import json from datetime import datetime from pathlib import Path @@ -15,7 +16,7 @@ load_task_bundle, ) from trace_bench.resolve import merge_kwargs, resolve_trainer_kwargs -from trace_bench.runner import BenchRunner, _has_trainables +from trace_bench.runner import BenchRunner, _has_trainables, _resolve_algorithm from trace_bench.artifacts import init_run_dir, write_manifest from trace_bench.ui import launch_ui @@ -92,6 +93,25 @@ def _resolve_symbol(module_name: str, symbol: str) -> bool: return False +def _allowed_trainer_kwargs_for(trainer_id: str) -> set[str]: + """Return the trainer kwargs accepted by strict validation for a trainer id.""" + allowed = set(_ALLOWED_TRAINER_KWARGS) + resolved = _resolve_algorithm(trainer_id) + if not isinstance(resolved, type): + return allowed + + try: + signature = inspect.signature(resolved.train) + except (TypeError, ValueError): + return allowed + + ignored = {"self", "guide", "train_dataset", "validate_dataset", "test_dataset", "mode"} + for name, parameter in signature.parameters.items(): + if name in ignored: + continue + if parameter.kind in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY): + allowed.add(name) + return allowed def _normalize_logger_override(raw: str | None) -> str | None: @@ -128,9 +148,10 @@ def 
_default_timeout(mode: str) -> float: def _validate_trainer_params(trainer, errors: list[str]) -> None: + allowed_kwargs = _allowed_trainer_kwargs_for(trainer.id) for params in trainer.params_variants or [{}]: for key in params.keys(): - if key not in _ALLOWED_TRAINER_KWARGS: + if key not in allowed_kwargs: errors.append(f"unknown trainer kwarg '{key}' for {trainer.id}") if trainer.optimizer and not _resolve_symbol("opto.optimizers", trainer.optimizer): diff --git a/trace_bench/resolve.py b/trace_bench/resolve.py index e285341..c173ed5 100644 --- a/trace_bench/resolve.py +++ b/trace_bench/resolve.py @@ -4,34 +4,55 @@ _FILTERED_KWARGS = {"eval_kwargs", "optimizer_kwargs"} +_GEPA_TRAINERS = {"GEPA-Base", "GEPA-UCB", "GEPA-Beam"} def _default_trainer_kwargs(algo_name: str) -> Dict[str, Any]: + """Return default kwargs for built-in Trace search trainers only.""" if algo_name == "PrioritySearch": - return dict(num_epochs=1, num_steps=1, num_batches=1, num_candidates=2, num_proposals=2) + return dict( + num_epochs=1, + num_steps=1, + num_batches=1, + num_candidates=2, + num_proposals=2, + ) if algo_name == "GEPA-Base": return dict(num_iters=1, train_batch_size=2, merge_every=2, pareto_subset_size=2) - # GEPA-UCB and GEPA-Beam use num_search_iterations - return dict(num_search_iterations=1, train_batch_size=2, merge_every=2, pareto_subset_size=2) + if algo_name in {"GEPA-UCB", "GEPA-Beam"}: + return dict( + num_search_iterations=1, + train_batch_size=2, + merge_every=2, + pareto_subset_size=2, + ) + return {} def _param_alias_map(algo_name: str) -> Dict[str, str]: - base = { + alias_map = { "threads": "num_threads", - "ps_steps": "num_steps", - "ps_batches": "num_batches", - "ps_candidates": "num_candidates", - "ps_proposals": "num_proposals", - "ps_mem_update": "memory_update_frequency", - "gepa_train_bs": "train_batch_size", - "gepa_merge_every": "merge_every", - "gepa_pareto_subset": "pareto_subset_size", } - if algo_name == "GEPA-Base": - base["gepa_iters"] = 
"num_iters" - else: - base["gepa_iters"] = "num_search_iterations" - return base + if algo_name == "PrioritySearch": + alias_map.update( + { + "ps_steps": "num_steps", + "ps_batches": "num_batches", + "ps_candidates": "num_candidates", + "ps_proposals": "num_proposals", + "ps_mem_update": "memory_update_frequency", + } + ) + if algo_name in _GEPA_TRAINERS: + alias_map.update( + { + "gepa_train_bs": "train_batch_size", + "gepa_merge_every": "merge_every", + "gepa_pareto_subset": "pareto_subset_size", + } + ) + alias_map["gepa_iters"] = "num_iters" if algo_name == "GEPA-Base" else "num_search_iterations" + return alias_map def resolve_trainer_kwargs(params: Dict[str, Any], algo_name: str) -> Dict[str, Any]: diff --git a/trace_bench/runner.py b/trace_bench/runner.py index c5bc838..2763dab 100644 --- a/trace_bench/runner.py +++ b/trace_bench/runner.py @@ -512,13 +512,14 @@ def _dummy_response(*_args, **_kwargs): uses_trace_optimizer = getattr(algo, "USES_TRACE_OPTIMIZER", True) - # For DSPy-style external trainers: propagate mode='stub' as - # dspy_lm='stub' so they configure DummyLM without requiring an explicit - # dspy_lm param in the config. OpenTrace trainers do not all accept this - # keyword, so keep the injection limited to external trainers that manage - # their own optimization loop. + if not uses_trace_optimizer: + kwargs.setdefault("mode", mode) + + # Keep backward-compatible DSPy stub support, but do not leak DSPy-only + # kwargs into unrelated external trainers. 
if mode == "stub" and not uses_trace_optimizer: - kwargs.setdefault("dspy_lm", "stub") + if getattr(algo, "FRAMEWORK", None) == "dspy": + kwargs.setdefault("dspy_lm", "stub") # Pass through multi-objective config from bundle if present objective_config = bundle.get("objective_config") diff --git a/trace_bench/trainers/README_openevolve_trainer.md b/trace_bench/trainers/README_openevolve_trainer.md new file mode 100644 index 0000000..e719eab --- /dev/null +++ b/trace_bench/trainers/README_openevolve_trainer.md @@ -0,0 +1,8 @@ +# OpenEvolveTrainer + +`OpenEvolveTrainer` is an external Trace-Bench trainer wrapper for `openevolve.run_evolution`. + +- Evolves a **safe literal** candidate mapping of trainable parameter values. +- Never executes candidate code via `exec`. +- Parses candidates using `ast.parse` and `ast.literal_eval` only. +- Can optionally keep only improving updates (`ensure_improvement=True`). diff --git a/trace_bench/trainers/README_textgrad_trainer.md b/trace_bench/trainers/README_textgrad_trainer.md new file mode 100644 index 0000000..13621fa --- /dev/null +++ b/trace_bench/trainers/README_textgrad_trainer.md @@ -0,0 +1,8 @@ +# TextGradTrainer + +`TextGradTrainer` is an external Trace-Bench trainer wrapper for `opto.optimizers.textgrad.TextGrad`. + +- Thin wrapper around NewTrace TextGrad. +- Supports `mode=stub` and `mode=real`. +- Uses trainable Trace parameters only. +- Can optionally keep only improving updates (`ensure_improvement=True`). 
diff --git a/trace_bench/trainers/_external_utils.py b/trace_bench/trainers/_external_utils.py
new file mode 100644
index 0000000..fc4ebb8
--- /dev/null
+++ b/trace_bench/trainers/_external_utils.py
@@ -0,0 +1,192 @@
+from __future__ import annotations
+
+from copy import deepcopy
+import importlib
+from typing import Any, Dict, List, Mapping, Sequence, Tuple
+
+
+def collect_trainable_parameters(model: Any) -> List[Any]:
+    """Return trainable parameter-like objects from a model or standalone parameter.
+
+    Args:
+        model: Object with a callable ``parameters()``, or a single
+            parameter-like object exposing ``trainable`` and ``data``.
+
+    Returns:
+        The parameters whose ``trainable`` attribute is truthy.
+
+    Raises:
+        ValueError: If ``model.parameters()`` yields no trainable parameter.
+        TypeError: If ``model`` is neither a model nor a trainable parameter.
+    """
+    if hasattr(model, "parameters") and callable(model.parameters):
+        parameters = [
+            parameter
+            for parameter in model.parameters()
+            if getattr(parameter, "trainable", False)
+        ]
+        if not parameters:
+            raise ValueError("Model.parameters() returned no trainable parameters.")
+        return parameters
+    if getattr(model, "trainable", False) and hasattr(model, "data"):
+        return [model]
+    raise TypeError("Expected a model with parameters() or a standalone trainable parameter-like object.")
+
+
+def coerce_like(example_value: Any, candidate_value: Any) -> Any:
+    """Coerce a candidate value to the same literal-like type as the current parameter value.
+
+    ``bool`` is tested before ``int`` because ``bool`` subclasses ``int``; a
+    boolean is never accepted where a number is expected.
+
+    Raises:
+        TypeError: If the candidate does not fit the example's type, or the
+            example's type is not one of the supported literal types.
+    """
+    if isinstance(example_value, bool):
+        if not isinstance(candidate_value, bool):
+            raise TypeError("Expected a boolean candidate value.")
+        return candidate_value
+    if isinstance(example_value, int):
+        if isinstance(candidate_value, bool):
+            raise TypeError("Expected an integer candidate value.")
+        if isinstance(candidate_value, int):
+            return candidate_value
+        # Integral floats such as 3.0 are accepted and narrowed to int.
+        if isinstance(candidate_value, float) and candidate_value.is_integer():
+            return int(candidate_value)
+        raise TypeError("Expected an integer candidate value.")
+    if isinstance(example_value, float):
+        if isinstance(candidate_value, bool) or not isinstance(candidate_value, (int, float)):
+            raise TypeError("Expected a numeric candidate value.")
+        return float(candidate_value)
+    if isinstance(example_value, str):
+        if not isinstance(candidate_value, str):
+            raise TypeError("Expected a string candidate value.")
+        return candidate_value
+    if isinstance(example_value, list):
+        if not isinstance(candidate_value, list):
+            raise TypeError("Expected a list candidate value.")
+        return candidate_value
+    if isinstance(example_value, tuple):
+        if not isinstance(candidate_value, (list, tuple)):
+            raise TypeError("Expected a sequence candidate value.")
+        return tuple(candidate_value)
+    if isinstance(example_value, dict):
+        if not isinstance(candidate_value, dict):
+            raise TypeError("Expected a mapping candidate value.")
+        return candidate_value
+    raise TypeError(f"Unsupported trainable parameter value type: {type(example_value).__name__}.")
+
+
+def snapshot_parameter_values(parameters: Sequence[Any]) -> Dict[Any, Any]:
+    """Deep-copy the current values of the provided parameters."""
+    return {parameter: deepcopy(parameter.data) for parameter in parameters}
+
+
+def _set_parameter_value(parameter: Any, value: Any) -> None:
+    """Set a parameter-like object's value in a way that works across Trace variants.
+
+    The value is deep-copied once, before any assignment is attempted, so a
+    copy failure propagates as itself instead of being mistaken for a
+    read-only ``data`` attribute.
+
+    Raises:
+        TypeError: If neither ``data`` nor ``_data`` can be written.
+    """
+    copied_value = deepcopy(value)
+    try:
+        parameter.data = copied_value
+    except (AttributeError, TypeError) as exc:
+        # ``data`` is read-only (e.g. a property without a setter); fall
+        # back to the private slot when the object has one.
+        if not hasattr(parameter, "_data"):
+            raise TypeError("Parameter object does not expose a writable data field.") from exc
+        parameter._data = copied_value
+
+
+def restore_parameter_values(snapshot: Mapping[Any, Any]) -> None:
+    """Restore a parameter snapshot created by snapshot_parameter_values()."""
+    for parameter, value in snapshot.items():
+        _set_parameter_value(parameter, value)
+
+
+def apply_parameter_updates(update_dict: Mapping[Any, Any]) -> None:
+    """Apply candidate parameter updates in place."""
+    for parameter, value in update_dict.items():
+        _set_parameter_value(parameter, value)
+
+
+def score_model_on_dataset(
+    agent: Any,
+    guide: Any,
+    dataset: Dict[str, Any],
+    *,
+    suppress_exceptions: bool = False,
+) -> Tuple[float, List[str]]:
+    """Evaluate an agent on a Trace-Bench dataset and return mean score plus feedback strings.
+
+    Args:
+        agent: Callable invoked as ``agent(task_input)``; a ``data`` attribute
+            on its result, when present, is used as the response.
+        guide: Callable invoked as ``guide(task_input, response, task_info)``
+            and returning ``(score, feedback)``.
+        dataset: Mapping with ``inputs`` and ``infos`` (or ``info``) lists.
+        suppress_exceptions: When true, a failing example is scored ``-inf``
+            and recorded as feedback instead of raising.
+
+    Raises:
+        ValueError: If the dataset is empty or its lists differ in length.
+    """
+    inputs = dataset.get("inputs") or []
+    infos = dataset.get("infos") or dataset.get("info") or []
+    if len(inputs) != len(infos):
+        raise ValueError("Dataset 'inputs' and 'infos' must have the same length.")
+    if not inputs:
+        raise ValueError("Dataset must contain at least one example.")
+
+    scores: List[float] = []
+    feedbacks: List[str] = []
+    for index, (task_input, task_info) in enumerate(zip(inputs, infos)):
+        try:
+            output = agent(task_input)
+            response = getattr(output, "data", output)
+            score, feedback = guide(task_input, response, task_info)
+            example_score = float(score)
+            example_feedback = str(feedback)
+        except Exception as exc:
+            if not suppress_exceptions:
+                raise
+            example_score = float("-inf")
+            example_feedback = f"evaluation_error[{index}]: {type(exc).__name__}"
+        # Record each example exactly once, after the try block, so a late
+        # conversion failure cannot leave a score without its feedback.
+        scores.append(example_score)
+        feedbacks.append(example_feedback)
+
+    return sum(scores) / len(scores), feedbacks
+
+
+def summarize_feedback(feedbacks: Sequence[str], *, max_items: int = 3) -> str:
+    """Return a compact textual summary of the first few feedback strings."""
+    items = [str(item) for item in feedbacks[:max_items]]
+    return " | ".join(items)
+
+
+def resolve_external_trainer_base() -> type:
+    """Resolve the most compatible trainer base across OpenTrace variants.
+
+    Returns ``object`` when the OpenTrace algorithm module cannot be imported
+    or exposes none of the known base-class names.
+    """
+    try:
+        module = importlib.import_module("opto.trainer.algorithms.algorithm")
+    except Exception:
+        return object
+
+    for class_name in ("Trainer", "AbstractAlgorithm", "Algorithm", "AlgorithmBase"):
+        trainer_base = getattr(module, class_name, None)
+        if isinstance(trainer_base, type):
+            return trainer_base
+
+    return object
diff --git a/trace_bench/trainers/openevolve_trainer.py b/trace_bench/trainers/openevolve_trainer.py
new file mode 100644
index 0000000..43b211a
--- /dev/null
+++ b/trace_bench/trainers/openevolve_trainer.py
@@ -0,0 +1,265 @@
+from __future__ import annotations
+
+import ast
+import inspect
+from pathlib import Path
+from pprint import pformat
+from threading import RLock
+from typing import Any, Dict, List, Optional, Union
+
+try:
+    from openevolve import run_evolution as _run_evolution
+except Exception as exc:
+    raise ImportError("OpenEvolveTrainer requires the optional 'openevolve' package.") from exc
+
+from trace_bench.trainers._external_utils import (
+    apply_parameter_updates,
+    collect_trainable_parameters,
+    coerce_like,
+    resolve_external_trainer_base,
+    restore_parameter_values,
+    score_model_on_dataset,
+    snapshot_parameter_values,
+    summarize_feedback,
+)
+
+
+_TrainerBase = resolve_external_trainer_base()
+# Candidate evaluation mutates the shared agent's parameters in place, so
+# evaluations are serialized behind one re-entrant lock.
+_EVALUATION_LOCK = RLock()
+
+
+def _validate_literal_value(value: Any) -> None:
+    """Ensure ``repr(value)`` can be read back by ``ast.literal_eval()``.
+
+    Raises:
+        TypeError: If the value's repr is not a Python literal.
+    """
+    try:
+        ast.literal_eval(repr(value))
+    except Exception as exc:
+        raise TypeError(
+            f"OpenEvolveTrainer supports only literal-like parameter values; got {type(value).__name__}."
+        ) from exc
+
+
+def _serialize_candidate_program(parameters: List[Any]) -> str:
+    """Serialize the current trainable parameter values to a safe Python literal program.
+
+    The program is a single ``candidate = {...}`` assignment keyed by each
+    parameter's ``py_name``.
+
+    Raises:
+        TypeError: If a parameter value is not literal-like.
+        ValueError: If two parameters share a ``py_name``, since one mapping
+            key could not address them independently.
+    """
+    payload: Dict[str, Any] = {}
+    for parameter in parameters:
+        name = parameter.py_name
+        if name in payload:
+            raise ValueError(f"Duplicate trainable parameter name {name!r}; candidate keys must be unique.")
+        value = parameter.data
+        _validate_literal_value(value)
+        payload[name] = value
+    return "candidate = " + pformat(payload, sort_dicts=True) + "\n"
+
+
+def _parse_candidate_program(program_text: str, parameters: List[Any]) -> Dict[Any, Any]:
+    """Parse a candidate program and coerce it back into parameter values.
+
+    The text is inspected with ``ast`` only and is never executed.
+
+    Returns:
+        Mapping from each parameter object to its coerced candidate value.
+
+    Raises:
+        ValueError: If the text is not exactly one ``candidate = <dict literal>``
+            assignment whose keys equal the trainable parameter names.
+        TypeError: If a value cannot be coerced to its parameter's type.
+    """
+    try:
+        syntax_tree = ast.parse(program_text, mode="exec")
+    except SyntaxError as exc:
+        raise ValueError("Candidate program must be valid Python.") from exc
+    if len(syntax_tree.body) != 1 or not isinstance(syntax_tree.body[0], ast.Assign):
+        raise ValueError("Candidate program must contain exactly one assignment to 'candidate'.")
+    assignment = syntax_tree.body[0]
+    targets = assignment.targets
+    if len(targets) != 1 or not isinstance(targets[0], ast.Name) or targets[0].id != "candidate":
+        raise ValueError("Candidate program must assign a literal mapping to 'candidate'.")
+    try:
+        candidate_mapping = ast.literal_eval(assignment.value)
+    except Exception as exc:
+        raise ValueError("Candidate mapping must be parseable via ast.literal_eval().") from exc
+    if not isinstance(candidate_mapping, dict):
+        raise ValueError("Candidate mapping must be a dict.")
+    expected_names = {parameter.py_name for parameter in parameters}
+    if set(candidate_mapping) != expected_names:
+        raise ValueError("Candidate mapping keys must exactly match the trainable parameter names.")
+    return {
+        parameter: coerce_like(parameter.data, candidate_mapping[parameter.py_name])
+        for parameter in parameters
+    }
+
+
+def _extract_best_code(result: Any) -> str:
+    """Extract the best candidate program text from an OpenEvolve result object.
+
+    Mapping keys are consulted first, then attributes of the same names.
+
+    Raises:
+        ValueError: If no string is found under a known key or attribute.
+    """
+    names = ("best_code", "code", "best_program")
+    if isinstance(result, dict):
+        for key in names:
+            value = result.get(key)
+            if isinstance(value, str):
+                return value
+    for attribute in names:
+        value = getattr(result, attribute, None)
+        if isinstance(value, str):
+            return value
+    raise ValueError("run_evolution did not return a best_code-like string.")
+
+
+def _filter_supported_kwargs(function: Any, kwargs: Dict[str, Any]) -> Dict[str, Any]:
+    """Drop kwargs that are not accepted by the target callable.
+
+    All kwargs pass through when the signature cannot be inspected or the
+    callable itself accepts ``**kwargs``.
+    """
+    try:
+        signature = inspect.signature(function)
+    except (TypeError, ValueError):
+        return dict(kwargs)
+    accepted = signature.parameters
+    if any(parameter.kind == inspect.Parameter.VAR_KEYWORD for parameter in accepted.values()):
+        return dict(kwargs)
+    return {key: value for key, value in kwargs.items() if key in accepted}
+
+
+class OpenEvolveTrainer(_TrainerBase):
+    """Trace-Bench wrapper around OpenEvolve using safe literal parameter serialization."""
+
+    USES_TRACE_OPTIMIZER = False
+
+    def __init__(self, agent: Any, optimizer: Any = None, logger: Any = None, **_kwargs: Any) -> None:
+        del optimizer
+        self.param = agent
+        self.logger = logger
+
+    def train(
+        self,
+        guide: Any,
+        train_dataset: Dict[str, Any],
+        *,
+        mode: str = "real",
+        validate_dataset: Optional[Dict[str, Any]] = None,
+        iterations: int = 10,
+        population_size: Optional[int] = None,
+        num_islands: Optional[int] = None,
+        seed: Optional[int] = None,
+        ensure_improvement: bool = True,
+        improvement_threshold: float = 0.0,
+        verbose: Union[bool, str] = False,
+        **_kwargs: Any,
+    ) -> Dict[str, Any]:
+        """Optimize Trace parameters with OpenEvolve via a literal candidate mapping.
+
+        Args:
+            guide: Scoring callable passed to ``score_model_on_dataset``.
+            train_dataset: Dataset used for scoring when no validation set is given.
+            mode: ``"real"`` runs OpenEvolve; ``"stub"`` returns immediately.
+            validate_dataset: Optional dataset used for scoring instead.
+            iterations: Number of evolution iterations; must be at least 1.
+            population_size, num_islands, seed: Forwarded to ``run_evolution``
+                when set and supported by its signature.
+            ensure_improvement: Revert to the baseline values unless the best
+                candidate scores at least ``baseline + improvement_threshold``.
+            improvement_threshold: Required score margin over the baseline.
+            verbose: Forwarded to ``run_evolution`` only when it is a ``bool``.
+
+        Returns:
+            A dict with ``status`` and ``resolved_optimizer`` entries.
+
+        Raises:
+            ValueError: On an invalid ``mode`` or ``iterations``, when there are
+                no trainable parameters, or when the returned program is invalid.
+        """
+        if mode not in {"real", "stub"}:
+            raise ValueError("mode must be either 'real' or 'stub'.")
+        if iterations < 1:
+            raise ValueError("iterations must be at least 1.")
+        if mode == "stub":
+            return {"status": "ok", "resolved_optimizer": "openevolve.run_evolution"}
+
+        parameters = collect_trainable_parameters(self.param)
+        evaluation_dataset = validate_dataset or train_dataset
+        baseline_snapshot = snapshot_parameter_values(parameters)
+        baseline_score, _ = score_model_on_dataset(
+            agent=self.param,
+            guide=guide,
+            dataset=evaluation_dataset,
+            suppress_exceptions=True,
+        )
+
+        def evaluator(candidate_path: str) -> Dict[str, Any]:
+            """Score the candidate program stored at ``candidate_path``."""
+            program_text = Path(candidate_path).read_text(encoding="utf-8")
+            try:
+                update_dict = _parse_candidate_program(program_text, parameters)
+            except (TypeError, ValueError) as exc:
+                return {"score": float("-inf"), "feedback": str(exc)}
+            with _EVALUATION_LOCK:
+                snapshot = snapshot_parameter_values(parameters)
+                try:
+                    apply_parameter_updates(update_dict)
+                    score, feedbacks = score_model_on_dataset(
+                        agent=self.param,
+                        guide=guide,
+                        dataset=evaluation_dataset,
+                        suppress_exceptions=True,
+                    )
+                finally:
+                    # Always put the previous values back so one candidate
+                    # never leaks into the next evaluation.
+                    restore_parameter_values(snapshot)
+            candidate = {parameter.py_name: value for parameter, value in update_dict.items()}
+            return {
+                "score": score,
+                "feedback": summarize_feedback(feedbacks),
+                "artifacts": {"candidate": candidate},
+            }
+
+        initial_program = _serialize_candidate_program(parameters)
+        run_kwargs = {
+            "iterations": iterations,
+            "population_size": population_size,
+            "num_islands": num_islands,
+            "seed": seed,
+            "verbose": verbose if isinstance(verbose, bool) else False,
+        }
+        filtered_kwargs = _filter_supported_kwargs(
+            _run_evolution,
+            {key: value for key, value in run_kwargs.items() if value is not None},
+        )
+        result = _run_evolution(initial_program=initial_program, evaluator=evaluator, **filtered_kwargs)
+
+        best_update = _parse_candidate_program(_extract_best_code(result), parameters)
+        apply_parameter_updates(best_update)
+        if ensure_improvement:
+            candidate_score, _ = score_model_on_dataset(
+                agent=self.param,
+                guide=guide,
+                dataset=evaluation_dataset,
+                suppress_exceptions=True,
+            )
+            # Phrased as "not >=" so a NaN score counts as "not improved";
+            # "candidate < baseline" is False for NaN and would keep it.
+            if not candidate_score >= baseline_score + improvement_threshold:
+                restore_parameter_values(baseline_snapshot)
+
+        return {"status": "ok", "resolved_optimizer": "openevolve.run_evolution"}
diff --git a/trace_bench/trainers/textgrad_trainer.py b/trace_bench/trainers/textgrad_trainer.py
new file mode 100644
index 0000000..a8eb5e0
--- /dev/null
+++ b/trace_bench/trainers/textgrad_trainer.py
@@ -0,0 +1,100 @@
+from __future__ import annotations
+
+from typing import Any, Dict, Optional, Union
+
+from opto import trace
+
+try:
+    from opto.optimizers.textgrad import TextGrad as _TraceTextGrad
+except Exception as exc:
+    raise ImportError("TextGradTrainer requires opto.optimizers.textgrad from the NewTrace fork.") from exc
+
+from trace_bench.trainers._external_utils import (
+    apply_parameter_updates,
+    collect_trainable_parameters,
+    coerce_like,
+    resolve_external_trainer_base,
+    restore_parameter_values,
+    score_model_on_dataset,
+    snapshot_parameter_values,
+)
+
+
+_TrainerBase = resolve_external_trainer_base()
+
+
+class TextGradTrainer(_TrainerBase):
+    """Trace-Bench wrapper around the Trace-native TextGrad optimizer from NewTrace."""
+
+    USES_TRACE_OPTIMIZER = False
+
+    def __init__(self, agent: Any, optimizer: Any = None, logger: Any = None, **_kwargs: Any) -> None:
+        del optimizer
+        self.param = agent
+        self.logger = logger
+
+    def 
_normalize_updates(self, update_dict: Dict[Any, Any]) -> Dict[Any, Any]: + """Coerce proposed values back to the current parameter types.""" + normalized: Dict[Any, Any] = {} + for parameter, candidate_value in update_dict.items(): + normalized[parameter] = coerce_like(getattr(parameter, "data"), candidate_value) + return normalized + + def _standard_optimization_step(self, guide: Any, task_input: Any, task_info: Any, min_score: float) -> tuple[Any, float, Any]: + """Run one forward/feedback step, preserving Trace execution errors as feedback.""" + try: + target = self.param(task_input) + response = getattr(target, "data", target) + score, feedback = guide(task_input, response, task_info) + return target, float(score), feedback + except trace.ExecutionError as exc: + target = exc.exception_node + return target, float(min_score), target.create_feedback("full") + + def train(self, guide: Any, train_dataset: Dict[str, Any], *, mode: str = "real", num_epochs: int = 1, batch_size: int = 1, min_score: float = 0.0, validate_dataset: Optional[Dict[str, Any]] = None, ensure_improvement: bool = True, improvement_threshold: float = 0.0, max_tokens: int = 4096, verbose: Union[bool, str] = False, **_kwargs: Any) -> Dict[str, Any]: + """Optimize Trace parameters with the TextGrad optimizer provided by NewTrace.""" + if mode not in {"real", "stub"}: + raise ValueError("mode must be either 'real' or 'stub'.") + if num_epochs < 1: + raise ValueError("num_epochs must be at least 1.") + if batch_size < 1: + raise ValueError("batch_size must be at least 1.") + if mode == "stub": + return {"status": "ok", "resolved_optimizer": "opto.optimizers.textgrad.TextGrad"} + + parameters = collect_trainable_parameters(self.param) + inputs = train_dataset.get("inputs") or [] + infos = train_dataset.get("infos") or train_dataset.get("info") or [] + if len(inputs) != len(infos): + raise ValueError("train_dataset 'inputs' and 'infos' must have the same length.") + if not inputs: + raise 
ValueError("train_dataset must contain at least one example.") + + optimizer = _TraceTextGrad(parameters=parameters, max_tokens=max_tokens) + for _ in range(num_epochs): + for start in range(0, len(inputs), batch_size): + batch_inputs = inputs[start : start + batch_size] + batch_infos = infos[start : start + batch_size] + evaluation_dataset = validate_dataset or {"inputs": batch_inputs, "infos": batch_infos} + optimizer.zero_feedback() + for task_input, task_info in zip(batch_inputs, batch_infos): + target, _score, feedback = self._standard_optimization_step(guide=guide, task_input=task_input, task_info=task_info, min_score=min_score) + optimizer.backward(target, feedback) + + proposal = optimizer.step(bypassing=True, verbose=verbose) + normalized = self._normalize_updates(proposal) + if not normalized: + continue + + snapshot = snapshot_parameter_values(parameters) + baseline_score: Optional[float] = None + if ensure_improvement: + baseline_score, _ = score_model_on_dataset(agent=self.param, guide=guide, dataset=evaluation_dataset, suppress_exceptions=True) + + apply_parameter_updates(normalized) + if ensure_improvement and baseline_score is not None: + candidate_score, _ = score_model_on_dataset(agent=self.param, guide=guide, dataset=evaluation_dataset, suppress_exceptions=True) + if candidate_score < baseline_score + improvement_threshold: + restore_parameter_values(snapshot) + + return {"status": "ok", "resolved_optimizer": "opto.optimizers.textgrad.TextGrad"} From 34161d317351aad15b6ba82cc5edee8c02d20766 Mon Sep 17 00:00:00 2001 From: doxav <> Date: Tue, 2 Jun 2026 08:03:10 +0200 Subject: [PATCH 2/8] stabilized notebook --- ...tgrad_openevolve_evaluation_notebook.ipynb | 3017 +++++++++++++++++ tests/test_dspy_trainer.py | 28 + tests/test_llm_utils.py | 19 + tests/test_openevolve_trainer.py | 101 +- trace_bench/trainers/dspy_trainer.py | 2 +- trace_bench/trainers/openevolve_trainer.py | 79 +- 6 files changed, 3237 insertions(+), 9 deletions(-) create mode 
100644 notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb create mode 100644 tests/test_dspy_trainer.py create mode 100644 tests/test_llm_utils.py diff --git a/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb b/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb new file mode 100644 index 0000000..87d0955 --- /dev/null +++ b/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb @@ -0,0 +1,3017 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6fa48e6e", + "metadata": {}, + "source": [ + "# Trainer comparison notebook\n", + "\n", + "This notebook validates and compares four real trainer paths:\n", + "\n", + "- `PrioritySearch` as the Trace baseline\n", + "- `TextGradTrainer`\n", + "- `OpenEvolveTrainer`\n", + "- `DSPyTrainer`\n", + "\n", + "It checks out the `textgrad_openevolve` branch, installs the real optional packages when needed, runs focused structural checks, and then runs a tiny real train/test comparison with OpenRouter or OpenAI.\n", + "The notebook assumes the `textgrad_openevolve` branch contains the trainer integration under test." + ] + }, + { + "cell_type": "markdown", + "id": "0f51598c", + "metadata": {}, + "source": [ + "## What this notebook verifies\n", + "\n", + "- required trainer packages import from real installations\n", + "- Trace-Bench discovers the trainer classes\n", + "- focused tests and compile checks pass\n", + "- every comparison row uses three train examples and three held-out examples\n", + "- result tables show trainer status, optimizer identity, before/after scores, and per-example outputs" + ] + }, + { + "cell_type": "markdown", + "id": "b4ae0512", + "metadata": {}, + "source": [ + "## High-level interpretation guide\n", + "\n", + "Use this notebook in three layers:\n", + "\n", + "1. **Code-level correctness**\n", + " - Do the new trainers exist?\n", + " - Are they discovered?\n", + " - Do focused tests pass?\n", + "\n", + "2. 
**Behavior-level smoke checks**\n", + " - Do the trainer paths run against real installed packages?\n", + " - Do they produce comparable before/after rows?\n", + "\n", + "3. **Practical comparison**\n", + " - Which trainers improve on the tiny task?\n", + " - Which trainers complete but do not improve in this small budget?" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "56d885a1", + "metadata": { + "execution": { + "iopub.execute_input": "2026-05-28T17:29:44.264090Z", + "iopub.status.busy": "2026-05-28T17:29:44.263974Z", + "iopub.status.idle": "2026-05-28T17:29:44.268749Z", + "shell.execute_reply": "2026-05-28T17:29:44.268429Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WORKDIR = /home/xav/code/Trace-Bench\n", + "TRACE_BENCH_REMOTE_URL = https://github.com/doxav/Trace-Bench.git\n", + "TRACE_BENCH_BRANCH = textgrad_openevolve\n", + "TRACE_BENCH_REPO = /home/xav/code/Trace-Bench\n", + "NEWTRACE_REMOTE_URL = https://github.com/doxav/NewTrace.git\n", + "NEWTRACE_BRANCH = experimental\n", + "NEWTRACE_REPO = /home/xav/code/Trace-Bench/NewTrace\n", + "OPENEVOLVE_REMOTE_URL = https://github.com/algorithmicsuperintelligence/openevolve.git\n", + "OPENEVOLVE_BRANCH = main\n", + "OPENEVOLVE_REPO = /home/xav/code/Trace-Bench/openevolve\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "import subprocess\n", + "from collections.abc import Sequence\n", + "from pathlib import Path\n", + "from subprocess import CompletedProcess\n", + "\n", + "WORKDIR = Path(\"/content\") if Path(\"/content\").exists() else Path.cwd()\n", + "CURRENT_REPO = Path.cwd()\n", + "TRACE_BENCH_REMOTE_URL = \"https://github.com/doxav/Trace-Bench.git\"\n", + "TRACE_BENCH_BRANCH = \"textgrad_openevolve\"\n", + "TRACE_BENCH_REPO = CURRENT_REPO if (CURRENT_REPO / \"trace_bench\").is_dir() else WORKDIR / \"Trace-Bench\"\n", + "NEWTRACE_REMOTE_URL = \"https://github.com/doxav/NewTrace.git\"\n", + "NEWTRACE_BRANCH = 
\"experimental\"\n", + "NEWTRACE_REPO = WORKDIR / \"NewTrace\"\n", + "OPENEVOLVE_REMOTE_URL = \"https://github.com/algorithmicsuperintelligence/openevolve.git\"\n", + "OPENEVOLVE_BRANCH = \"main\"\n", + "OPENEVOLVE_REPO = WORKDIR / \"openevolve\"\n", + "\n", + "for repo_path in (NEWTRACE_REPO, TRACE_BENCH_REPO):\n", + " repo_path_str = str(repo_path)\n", + " if repo_path_str not in sys.path:\n", + " sys.path.insert(0, repo_path_str)\n", + "\n", + "def run(cmd: Sequence[str | os.PathLike[str]], cwd: Path | str | None = None, check: bool = True) -> CompletedProcess[bytes]:\n", + " \"\"\"Run a subprocess command and echo its argv without shell interpolation.\"\"\"\n", + " print(\"$\", \" \".join(map(str, cmd)))\n", + " return subprocess.run([str(part) for part in cmd], cwd=cwd, check=check)\n", + "\n", + "def checkout_branch(repo_path: Path, remote_url: str, branch: str) -> None:\n", + " \"\"\"Fetch, checkout, and fast-forward a branch in an existing clone.\"\"\"\n", + " run([\"git\", \"fetch\", remote_url, branch], cwd=repo_path)\n", + " checkout = run([\"git\", \"checkout\", branch], cwd=repo_path, check=False)\n", + " if checkout.returncode != 0:\n", + " run([\"git\", \"checkout\", \"-b\", branch, \"FETCH_HEAD\"], cwd=repo_path)\n", + " run([\"git\", \"pull\", \"--ff-only\", remote_url, branch], cwd=repo_path)\n", + "\n", + "print(\"WORKDIR =\", WORKDIR)\n", + "print(\"TRACE_BENCH_REMOTE_URL =\", TRACE_BENCH_REMOTE_URL)\n", + "print(\"TRACE_BENCH_BRANCH =\", TRACE_BENCH_BRANCH)\n", + "print(\"TRACE_BENCH_REPO =\", TRACE_BENCH_REPO)\n", + "print(\"NEWTRACE_REMOTE_URL =\", NEWTRACE_REMOTE_URL)\n", + "print(\"NEWTRACE_BRANCH =\", NEWTRACE_BRANCH)\n", + "print(\"NEWTRACE_REPO =\", NEWTRACE_REPO)\n", + "print(\"OPENEVOLVE_REMOTE_URL =\", OPENEVOLVE_REMOTE_URL)\n", + "print(\"OPENEVOLVE_BRANCH =\", OPENEVOLVE_BRANCH)\n", + "print(\"OPENEVOLVE_REPO =\", OPENEVOLVE_REPO)" + ] + }, + { + "cell_type": "markdown", + "id": "6d7c51fb", + "metadata": {}, + "source": [ + "## 1. 
Clone and checkout the repositories\n", + "\n", + "This clones:\n", + "- `Trace-Bench` on `textgrad_openevolve`\n", + "- `doxav/NewTrace` on `experimental`\n", + "- `OpenEvolve` only if the real package is missing\n", + "\n", + "Skip this if you already have local checkouts and want to point the notebook at them manually." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b6ca8593", + "metadata": { + "execution": { + "iopub.execute_input": "2026-05-28T17:29:44.270467Z", + "iopub.status.busy": "2026-05-28T17:29:44.270388Z", + "iopub.status.idle": "2026-05-28T17:29:46.996012Z", + "shell.execute_reply": "2026-05-28T17:29:46.995690Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Trace-Bench already exists; checking out textgrad_openevolve.\n", + "$ git fetch https://github.com/doxav/Trace-Bench.git textgrad_openevolve\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "From https://github.com/doxav/Trace-Bench\n", + " * branch textgrad_openevolve -> FETCH_HEAD\n", + "Already on 'textgrad_openevolve'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "$ git checkout textgrad_openevolve\n", + "M\ttests/test_openevolve_trainer.py\n", + "M\ttrace_bench/trainers/openevolve_trainer.py\n", + "Your branch is up to date with 'origin/textgrad_openevolve'.\n", + "$ git pull --ff-only https://github.com/doxav/Trace-Bench.git textgrad_openevolve\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "From https://github.com/doxav/Trace-Bench\n", + " * branch textgrad_openevolve -> FETCH_HEAD\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Already up to date.\n", + "NewTrace already exists; checking out experimental.\n", + "$ git fetch https://github.com/doxav/NewTrace.git experimental\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "From https://github.com/doxav/NewTrace\n", + " * branch 
experimental -> FETCH_HEAD\n", + "Already on 'experimental'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "$ git checkout experimental\n", + "Your branch is up to date with 'origin/experimental'.\n", + "$ git pull --ff-only https://github.com/doxav/NewTrace.git experimental\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Already up to date.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "From https://github.com/doxav/NewTrace\n", + " * branch experimental -> FETCH_HEAD\n" + ] + } + ], + "source": [ + "if not TRACE_BENCH_REPO.exists():\n", + " run([\n", + " \"git\", \"clone\",\n", + " \"--branch\", TRACE_BENCH_BRANCH,\n", + " \"--single-branch\",\n", + " TRACE_BENCH_REMOTE_URL,\n", + " str(TRACE_BENCH_REPO),\n", + " ])\n", + "else:\n", + " print(f\"Trace-Bench already exists; checking out {TRACE_BENCH_BRANCH}.\")\n", + " checkout_branch(TRACE_BENCH_REPO, TRACE_BENCH_REMOTE_URL, TRACE_BENCH_BRANCH)\n", + "\n", + "if not NEWTRACE_REPO.exists():\n", + " run([\n", + " \"git\", \"clone\",\n", + " \"--branch\", NEWTRACE_BRANCH,\n", + " \"--single-branch\",\n", + " NEWTRACE_REMOTE_URL,\n", + " str(NEWTRACE_REPO),\n", + " ])\n", + "else:\n", + " print(f\"NewTrace already exists; checking out {NEWTRACE_BRANCH}.\")\n", + " checkout_branch(NEWTRACE_REPO, NEWTRACE_REMOTE_URL, NEWTRACE_BRANCH)" + ] + }, + { + "cell_type": "markdown", + "id": "963c01d5", + "metadata": {}, + "source": [ + "## 2. Install Python dependencies\n", + "\n", + "This installs:\n", + "- `NewTrace` editable\n", + "- `Trace-Bench` editable\n", + "- light dependencies needed for the focused validation notebook\n", + "\n", + "If `openevolve.run_evolution` is not importable, this clones OpenEvolve from GitHub and installs it editable." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "fbae758b", + "metadata": { + "execution": { + "iopub.execute_input": "2026-05-28T17:29:46.997122Z", + "iopub.status.busy": "2026-05-28T17:29:46.997049Z", + "iopub.status.idle": "2026-05-28T17:29:51.676176Z", + "shell.execute_reply": "2026-05-28T17:29:51.675385Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "$ /home/xav/miniconda3/bin/python -m pip install -q -U pip setuptools wheel\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "$ /home/xav/miniconda3/bin/python -m pip install -q graphviz pyyaml pytest litellm aiohttp nest_asyncio dspy-ai tensorboard tensorboardX scikit-learn datasets openai pandas\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "$ /home/xav/miniconda3/bin/python -m pip install -q -e /home/xav/code/Trace-Bench/NewTrace\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "$ /home/xav/miniconda3/bin/python -m pip install -q -e /home/xav/code/Trace-Bench\n" + ] + } + ], + "source": [ + "run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-U\", \"pip\", \"setuptools\", \"wheel\"])\n", + "run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\",\n", + " \"graphviz\", \"pyyaml\", \"pytest\", \"litellm\", \"aiohttp\", \"nest_asyncio\", \"dspy-ai\",\n", + " \"tensorboard\", \"tensorboardX\", \"scikit-learn\", \"datasets\", \"openai\", \"pandas\"])\n", + "\n", + "run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-e\", str(NEWTRACE_REPO)])\n", + "run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-e\", str(TRACE_BENCH_REPO)])\n", + "\n", + "def has_real_openevolve() -> bool:\n", + " \"\"\"Return True only when the real OpenEvolve API is importable.\"\"\"\n", + " try:\n", + " import openevolve\n", + " return callable(getattr(openevolve, \"run_evolution\", None))\n", + " except Exception:\n", + " return False\n", + "\n", + "if not 
has_real_openevolve():\n", + " if not OPENEVOLVE_REPO.exists():\n", + " run([\n", + " \"git\", \"clone\",\n", + " \"--branch\", OPENEVOLVE_BRANCH,\n", + " \"--single-branch\",\n", + " OPENEVOLVE_REMOTE_URL,\n", + " str(OPENEVOLVE_REPO),\n", + " ])\n", + " else:\n", + " print(f\"OpenEvolve already exists; checking out {OPENEVOLVE_BRANCH}.\")\n", + " checkout_branch(OPENEVOLVE_REPO, OPENEVOLVE_REMOTE_URL, OPENEVOLVE_BRANCH)\n", + " run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-e\", str(OPENEVOLVE_REPO)])\n", + "\n", + "if not has_real_openevolve():\n", + " raise ImportError(\"OpenEvolve is required for this demo and could not be installed.\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "25fd9e44", + "metadata": {}, + "source": [ + "## 3. Provider setup for real online experiments\n", + "\n", + "The real smoke comparison requires this provider setup. Structural tests can still run before a provider is configured.\n", + "\n", + "Supported:\n", + "- `openrouter`\n", + "- `openai`" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0b984ab0", + "metadata": { + "execution": { + "iopub.execute_input": "2026-05-28T17:29:51.677483Z", + "iopub.status.busy": "2026-05-28T17:29:51.677372Z", + "iopub.status.idle": "2026-05-28T17:29:51.680910Z", + "shell.execute_reply": "2026-05-28T17:29:51.680574Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PROVIDER = openrouter\n", + "TRACE_LITELLM_MODEL = openrouter/openai/gpt-4o-mini\n", + "OPENAI_BASE_URL = https://openrouter.ai/api/v1\n", + "OPENROUTER_API_KEY configured = True\n" + ] + } + ], + "source": [ + "from getpass import getpass\n", + "\n", + "def colab_secret(name: str) -> str:\n", + " \"\"\"Return a Colab Secret value when available, otherwise an empty string.\"\"\"\n", + " try:\n", + " from google.colab import userdata\n", + " except Exception:\n", + " return \"\"\n", + " try:\n", + " return userdata.get(name) or \"\"\n", + " except 
Exception:\n", + " return \"\"\n", + "\n", + "PROVIDER = \"auto\" # @param [\"auto\", \"openrouter\", \"openai\", \"none\"]\n", + "MODEL = \"\" # @param {type:\"string\"}\n", + "\n", + "openrouter_key = os.environ.get(\"OPENROUTER_API_KEY\") or colab_secret(\"OPENROUTER_API_KEY\")\n", + "openai_key = os.environ.get(\"OPENAI_API_KEY\") or colab_secret(\"OPENAI_API_KEY\")\n", + "MODEL = MODEL or os.environ.get(\"TRACE_LITELLM_MODEL\") or colab_secret(\"TRACE_LITELLM_MODEL\")\n", + "\n", + "if PROVIDER == \"auto\":\n", + " active_provider = \"openrouter\" if openrouter_key else \"openai\" if openai_key else \"none\"\n", + "else:\n", + " active_provider = PROVIDER\n", + "\n", + "if active_provider == \"openrouter\":\n", + " if not MODEL:\n", + " MODEL = \"openrouter/openai/gpt-4o-mini\"\n", + " if not openrouter_key:\n", + " openrouter_key = getpass(\"OPENROUTER_API_KEY: \")\n", + " if not openrouter_key:\n", + " raise ValueError(\"OPENROUTER_API_KEY is required when PROVIDER is openrouter.\")\n", + " os.environ[\"OPENROUTER_API_KEY\"] = openrouter_key\n", + " os.environ[\"OPENAI_API_KEY\"] = openrouter_key\n", + " os.environ[\"OPENAI_BASE_URL\"] = \"https://openrouter.ai/api/v1\"\n", + " os.environ[\"OPENAI_API_BASE\"] = \"https://openrouter.ai/api/v1\"\n", + " os.environ[\"TRACE_LITELLM_MODEL\"] = MODEL\n", + "elif active_provider == \"openai\":\n", + " if not MODEL:\n", + " MODEL = \"gpt-4o-mini\"\n", + " if not openai_key:\n", + " openai_key = getpass(\"OPENAI_API_KEY: \")\n", + " if not openai_key:\n", + " raise ValueError(\"OPENAI_API_KEY is required when PROVIDER is openai.\")\n", + " os.environ[\"OPENAI_API_KEY\"] = openai_key\n", + " os.environ[\"TRACE_LITELLM_MODEL\"] = MODEL\n", + "elif active_provider == \"none\":\n", + " print(\"Skipping online provider configuration.\")\n", + "else:\n", + " raise ValueError(f\"Unsupported PROVIDER: {PROVIDER}\")\n", + "\n", + "print(\"PROVIDER =\", active_provider)\n", + "print(\"TRACE_LITELLM_MODEL =\", 
os.environ.get(\"TRACE_LITELLM_MODEL\"))\n", + "print(\"OPENAI_BASE_URL =\", os.environ.get(\"OPENAI_BASE_URL\"))\n", + "print(\"OPENROUTER_API_KEY configured =\", bool(os.environ.get(\"OPENROUTER_API_KEY\")))" + ] + }, + { + "cell_type": "markdown", + "id": "1a574c62", + "metadata": {}, + "source": [ + "## 4. Sanity checks and imports" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3b4768bb", + "metadata": { + "execution": { + "iopub.execute_input": "2026-05-28T17:29:51.682018Z", + "iopub.status.busy": "2026-05-28T17:29:51.681957Z", + "iopub.status.idle": "2026-05-28T17:29:56.379387Z", + "shell.execute_reply": "2026-05-28T17:29:56.378979Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OK: opto.optimizers.textgrad\n", + "OK: openevolve\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OK: dspy\n", + "OK: trace_bench\n", + "OK: trace_bench.runner\n", + "OK: trace_bench.registry\n", + "OK: trace_bench.config\n", + "OK: trace_bench.trainers.textgrad_trainer\n", + "OK: trace_bench.trainers.openevolve_trainer\n", + "OK: trace_bench.trainers.dspy_trainer\n", + "TextGrad module: /home/xav/code/Trace-Bench/NewTrace/opto/optimizers/textgrad.py\n", + "OpenEvolve module: /home/xav/miniconda3/lib/python3.13/site-packages/openevolve/__init__.py\n", + "DSPy module: /home/xav/miniconda3/lib/python3.13/site-packages/dspy/__init__.py\n" + ] + } + ], + "source": [ + "import importlib\n", + "import pandas as pd\n", + "\n", + "def required_import(name: str) -> object:\n", + " \"\"\"Import a required module and raise a descriptive error when unavailable.\"\"\"\n", + " try:\n", + " module = importlib.import_module(name)\n", + " print(\"OK:\", name)\n", + " return module\n", + " except Exception as exc:\n", + " raise ImportError(f\"Required module is unavailable: {name}\") from exc\n", + "\n", + "textgrad_module = required_import(\"opto.optimizers.textgrad\")\n", + "openevolve_module = 
required_import(\"openevolve\")\n", + "dspy_module = required_import(\"dspy\")\n", + "required_import(\"trace_bench\")\n", + "required_import(\"trace_bench.runner\")\n", + "required_import(\"trace_bench.registry\")\n", + "required_import(\"trace_bench.config\")\n", + "required_import(\"trace_bench.trainers.textgrad_trainer\")\n", + "required_import(\"trace_bench.trainers.openevolve_trainer\")\n", + "required_import(\"trace_bench.trainers.dspy_trainer\")\n", + "\n", + "if not callable(getattr(textgrad_module, \"TextGrad\", None)):\n", + " raise ImportError(\"opto.optimizers.textgrad.TextGrad is required for this demo.\")\n", + "if not callable(getattr(openevolve_module, \"run_evolution\", None)):\n", + " raise ImportError(\"openevolve.run_evolution is required for this demo.\")\n", + "if not callable(getattr(dspy_module, \"LM\", None)):\n", + " raise ImportError(\"dspy.LM is required for this demo.\")\n", + "\n", + "print(\"TextGrad module:\", getattr(textgrad_module, \"__file__\", \"unknown\"))\n", + "print(\"OpenEvolve module:\", getattr(openevolve_module, \"__file__\", \"unknown\"))\n", + "print(\"DSPy module:\", getattr(dspy_module, \"__file__\", \"unknown\"))" + ] + }, + { + "cell_type": "markdown", + "id": "4e82660b", + "metadata": {}, + "source": [ + "## 5. Focused validation commands\n", + "\n", + "These are the most relevant tests for the new trainers and their integration surface." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "af508c08", + "metadata": { + "execution": { + "iopub.execute_input": "2026-05-28T17:29:56.380992Z", + "iopub.status.busy": "2026-05-28T17:29:56.380906Z", + "iopub.status.idle": "2026-05-28T17:29:57.560432Z", + "shell.execute_reply": "2026-05-28T17:29:57.559955Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "$ /home/xav/miniconda3/bin/python -m pytest tests/test_resolve_external_trainers.py tests/test_external_utils.py -q\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... [100%]\n", + "3 passed in 0.93s\n", + "$ /home/xav/miniconda3/bin/python -m py_compile trace_bench/resolve.py trace_bench/cli.py trace_bench/runner.py trace_bench/trainers/_external_utils.py trace_bench/trainers/textgrad_trainer.py trace_bench/trainers/openevolve_trainer.py trace_bench/trainers/dspy_trainer.py\n" + ] + }, + { + "data": { + "text/plain": [ + "CompletedProcess(args=['/home/xav/miniconda3/bin/python', '-m', 'py_compile', 'trace_bench/resolve.py', 'trace_bench/cli.py', 'trace_bench/runner.py', 'trace_bench/trainers/_external_utils.py', 'trace_bench/trainers/textgrad_trainer.py', 'trace_bench/trainers/openevolve_trainer.py', 'trace_bench/trainers/dspy_trainer.py'], returncode=0)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "TARGETED_TESTS = [\n", + " \"tests/test_resolve_external_trainers.py\",\n", + " \"tests/test_external_utils.py\",\n", + "]\n", + "\n", + "run([sys.executable, \"-m\", \"pytest\", *TARGETED_TESTS, \"-q\"], cwd=TRACE_BENCH_REPO)\n", + "run([sys.executable, \"-m\", \"py_compile\",\n", + " \"trace_bench/resolve.py\",\n", + " \"trace_bench/cli.py\",\n", + " \"trace_bench/runner.py\",\n", + " \"trace_bench/trainers/_external_utils.py\",\n", + " \"trace_bench/trainers/textgrad_trainer.py\",\n", + " \"trace_bench/trainers/openevolve_trainer.py\",\n", + " 
\"trace_bench/trainers/dspy_trainer.py\"], cwd=TRACE_BENCH_REPO)" + ] + }, + { + "cell_type": "markdown", + "id": "3442bf98", + "metadata": {}, + "source": [ + "## 6. Trainer discovery and signatures\n", + "\n", + "This is the fastest way to see whether the branch contains the trainer code and wires it into discovery." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c182738c", + "metadata": { + "execution": { + "iopub.execute_input": "2026-05-28T17:29:57.561442Z", + "iopub.status.busy": "2026-05-28T17:29:57.561358Z", + "iopub.status.idle": "2026-05-28T17:29:57.605388Z", + "shell.execute_reply": "2026-05-28T17:29:57.604880Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trainer_idavailablesourceresolved_typeresolved_nameuses_trace_optimizerframework
0DSPyTrainerTruetrace_bench.trainers.dspy_trainerclassDSPyTrainerFalsedspy
1OpenEvolveTrainerTruetrace_bench.trainers.openevolve_trainerclassOpenEvolveTrainerFalseNaN
2PrioritySearchTrueopto.features.priority_search.priority_searchstrPrioritySearchNoneNaN
3TextGradTrainerTruetrace_bench.trainers.textgrad_trainerclassTextGradTrainerFalseNaN
\n", + "
" + ], + "text/plain": [ + " trainer_id available \\\n", + "0 DSPyTrainer True \n", + "1 OpenEvolveTrainer True \n", + "2 PrioritySearch True \n", + "3 TextGradTrainer True \n", + "\n", + " source resolved_type \\\n", + "0 trace_bench.trainers.dspy_trainer class \n", + "1 trace_bench.trainers.openevolve_trainer class \n", + "2 opto.features.priority_search.priority_search str \n", + "3 trace_bench.trainers.textgrad_trainer class \n", + "\n", + " resolved_name uses_trace_optimizer framework \n", + "0 DSPyTrainer False dspy \n", + "1 OpenEvolveTrainer False NaN \n", + "2 PrioritySearch None NaN \n", + "3 TextGradTrainer False NaN " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from trace_bench.registry import discover_trainers\n", + "from trace_bench.runner import _resolve_algorithm\n", + "\n", + "trainer_rows = []\n", + "for spec in discover_trainers():\n", + " if spec.id in {\"PrioritySearch\", \"TextGradTrainer\", \"OpenEvolveTrainer\", \"DSPyTrainer\"}:\n", + " resolved = _resolve_algorithm(spec.id)\n", + " trainer_rows.append({\n", + " \"trainer_id\": spec.id,\n", + " \"available\": spec.available,\n", + " \"source\": spec.source,\n", + " \"resolved_type\": type(resolved).__name__ if not isinstance(resolved, type) else \"class\",\n", + " \"resolved_name\": getattr(resolved, \"__name__\", str(resolved)),\n", + " \"uses_trace_optimizer\": getattr(resolved, \"USES_TRACE_OPTIMIZER\", None) if isinstance(resolved, type) else None,\n", + " \"framework\": getattr(resolved, \"FRAMEWORK\", None) if isinstance(resolved, type) else None,\n", + " })\n", + "\n", + "pd.DataFrame(trainer_rows).sort_values(\"trainer_id\").reset_index(drop=True)" + ] + }, + { + "cell_type": "markdown", + "id": "118fce8b", + "metadata": {}, + "source": [ + "## 7. Shared helpers for train/test smoke evaluation\n", + "\n", + "The Trace, TextGrad, and OpenEvolve rows reuse `trace_examples:opentrace_train_single_node`. 
The DSPy row uses a tiny real `dspy.Module` with the same scalar target. Every row learns from three examples and reports held-out performance on three more examples." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8eddf6c9", + "metadata": { + "execution": { + "iopub.execute_input": "2026-05-28T17:29:57.606741Z", + "iopub.status.busy": "2026-05-28T17:29:57.606666Z", + "iopub.status.idle": "2026-05-28T17:29:57.617692Z", + "shell.execute_reply": "2026-05-28T17:29:57.617113Z" + } + }, + "outputs": [], + "source": [ + "import re\n", + "from typing import Any\n", + "\n", + "import dspy\n", + "\n", + "from trace_bench.config import TrainerConfig\n", + "from trace_bench.registry import load_task_bundle\n", + "from trace_bench.runner import _train_bundle\n", + "from trace_bench.trainers._external_utils import apply_parameter_updates\n", + "\n", + "TRACE_TASK_ID = \"trace_examples:opentrace_train_single_node\"\n", + "TASKS_ROOT = str(TRACE_BENCH_REPO / \"LLM4AD\" / \"benchmark_tasks\")\n", + "SMOKE_INITIAL_VALUE = 0.0\n", + "SMOKE_TARGET_VALUE = 3.0\n", + "SMOKE_TRAIN_DATASET = {\n", + " \"inputs\": [\"train-a\", \"train-b\", \"train-c\"],\n", + " \"infos\": [SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE],\n", + "}\n", + "SMOKE_TEST_DATASET = {\n", + " \"inputs\": [\"test-a\", \"test-b\", \"test-c\"],\n", + " \"infos\": [SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE],\n", + "}\n", + "\n", + "class ScalarDSPySignature(dspy.Signature):\n", + " \"\"\"Always answer 0.\"\"\"\n", + " question: str = dspy.InputField()\n", + " answer: str = dspy.OutputField(desc=\"numeric scalar answer only\")\n", + "\n", + "class ScalarDSPyAgent(dspy.Module):\n", + " \"\"\"Tiny DSPy module for the real DSPyTrainer smoke row.\"\"\"\n", + " def __init__(self) -> None:\n", + " super().__init__()\n", + " self.predict = dspy.Predict(ScalarDSPySignature)\n", + "\n", + " def forward(self, question: str) -> str:\n", + " return 
self.predict(question=question).answer\n", + "\n", + " @classmethod\n", + " def to_examples(cls, inputs: list[Any], infos: list[Any]) -> list[Any]:\n", + " return [\n", + " dspy.Example(question=str(task_input), answer=str(task_info), _task=task_input, _info=task_info).with_inputs(\"question\")\n", + " for task_input, task_info in zip(inputs, infos)\n", + " ]\n", + "\n", + "class ScalarDSPyGuide:\n", + " \"\"\"Score numeric DSPy answers against the scalar target.\"\"\"\n", + " def get_feedback(self, _query: Any, response: Any, reference: Any, **_kwargs: Any) -> tuple[float, str]:\n", + " text = str(getattr(response, \"data\", response)).strip()\n", + " match = re.search(r\"-?\\d+(?:\\.\\d+)?\", text)\n", + " prediction = float(match.group(0)) if match else float(\"nan\")\n", + " target = float(reference)\n", + " score = -abs(prediction - target) if prediction == prediction else -10.0\n", + " return score, f\"target={target}; response={text}\"\n", + "\n", + " def __call__(self, query: Any, response: Any, reference: Any, **kwargs: Any) -> tuple[float, str]:\n", + " return self.get_feedback(query, response, reference, **kwargs)\n", + "\n", + "def make_trace_bundle() -> dict[str, Any]:\n", + " \"\"\"Load the existing Trace-Bench scalar example bundle.\"\"\"\n", + " return load_task_bundle(TRACE_TASK_ID, TASKS_ROOT)\n", + "\n", + "def _set_only_scalar_trainable(bundle: dict[str, Any]) -> None:\n", + " \"\"\"Keep the smoke focused on the existing scalar parameter.\"\"\"\n", + " param = bundle[\"param\"]\n", + " scalar = getattr(param, \"value\", None)\n", + " if scalar is None:\n", + " scalar = getattr(param, \"guess\", None)\n", + " if scalar is None:\n", + " raise AttributeError(\"Scalar smoke task requires param.value or param.guess.\")\n", + " for parameter in param.parameters():\n", + " parameter.trainable = parameter is scalar\n", + " apply_parameter_updates({scalar: SMOKE_INITIAL_VALUE})\n", + "\n", + "def make_trace_smoke_bundle() -> dict[str, Any]:\n", + " 
\"\"\"Build a fresh train/test smoke bundle from the Trace scalar example.\"\"\"\n", + " bundle = make_trace_bundle()\n", + " _set_only_scalar_trainable(bundle)\n", + " bundle[\"train_dataset\"] = SMOKE_TRAIN_DATASET\n", + " bundle[\"test_dataset\"] = SMOKE_TEST_DATASET\n", + " bundle.pop(\"validate_dataset\", None)\n", + " bundle[\"optimizer_kwargs\"][\"objective\"] = f\"Set the trainable scalar to exactly {SMOKE_TARGET_VALUE}.\"\n", + " bundle[\"metadata\"][\"task_label\"] = \"Trace scalar\"\n", + " return bundle\n", + "\n", + "def make_dspy_lm() -> Any:\n", + " \"\"\"Build the real DSPy LM from the configured provider environment.\"\"\"\n", + " model = os.environ.get(\"TRACE_LITELLM_MODEL\") or \"gpt-4o-mini\"\n", + " if \"/\" not in model and (\"gpt\" in model.lower() or model.lower().startswith(\"o\")):\n", + " model = f\"openai/{model}\"\n", + " lm_kwargs: dict[str, Any] = {\"cache\": False}\n", + " api_base = os.environ.get(\"OPENAI_BASE_URL\") or os.environ.get(\"OPENAI_API_BASE\")\n", + " if api_base:\n", + " lm_kwargs[\"api_base\"] = api_base\n", + " return dspy.LM(model=model, **lm_kwargs)\n", + "\n", + "def make_dspy_smoke_bundle() -> dict[str, Any]:\n", + " \"\"\"Build a fresh train/test smoke bundle for the DSPy trainer row.\"\"\"\n", + " dspy.configure(lm=make_dspy_lm())\n", + " return {\n", + " \"param\": ScalarDSPyAgent(),\n", + " \"guide\": ScalarDSPyGuide(),\n", + " \"train_dataset\": SMOKE_TRAIN_DATASET,\n", + " \"test_dataset\": SMOKE_TEST_DATASET,\n", + " \"optimizer_kwargs\": {\"objective\": f\"Answer every scalar benchmark item with exactly {SMOKE_TARGET_VALUE}.\"},\n", + " \"metadata\": {\"task_label\": \"DSPy scalar\", \"framework\": \"dspy\"},\n", + " }\n", + "\n", + "def short_text(value: Any, limit: int = 80) -> str:\n", + " \"\"\"Return a compact display value for comparison tables.\"\"\"\n", + " text = str(value)\n", + " return text if len(text) <= limit else text[: limit - 3] + \"...\"\n", + "\n", + "def 
snapshot_trainable_value(bundle: dict[str, Any]) -> Any:\n", + " \"\"\"Return the current scalar value or DSPy instruction.\"\"\"\n", + " scalar = getattr(bundle[\"param\"], \"value\", None)\n", + " if scalar is None:\n", + " scalar = getattr(bundle[\"param\"], \"guess\", None)\n", + " if scalar is not None:\n", + " return getattr(scalar, \"data\", None)\n", + " predictor = getattr(bundle[\"param\"], \"predict\", None)\n", + " signature = getattr(predictor, \"signature\", None)\n", + " return short_text(getattr(signature, \"instructions\", type(bundle[\"param\"]).__name__))\n", + "\n", + "def task_label(bundle: dict[str, Any]) -> str:\n", + " \"\"\"Return the display label for a smoke bundle.\"\"\"\n", + " return str(bundle.get(\"metadata\", {}).get(\"task_label\") or bundle.get(\"metadata\", {}).get(\"benchmark\") or \"smoke\")\n", + "\n", + "def output_value(output: Any) -> Any:\n", + " \"\"\"Return a compact scalar/string output value.\"\"\"\n", + " return short_text(getattr(output, \"data\", output), limit=120)\n", + "\n", + "def score_guide(guide: Any, task_input: Any, response: Any, task_info: Any) -> tuple[float, str]:\n", + " \"\"\"Score with Trace Guide or DSPy-style get_feedback guide.\"\"\"\n", + " if callable(guide):\n", + " score, feedback = guide(task_input, response, task_info)\n", + " else:\n", + " score, feedback = guide.get_feedback(task_input, response, task_info)\n", + " return float(score), str(feedback)\n", + "\n", + "def run_train_bundle(\n", + " trainer_id: str,\n", + " params: dict[str, Any] | None = None,\n", + " mode: str = \"real\",\n", + " logger: str = \"none\",\n", + " bundle_factory: Any = make_trace_smoke_bundle,\n", + ") -> dict[str, Any]:\n", + " \"\"\"Run one trainer on the 3-example train split and score the 3-example test split.\"\"\"\n", + " bundle = bundle_factory()\n", + " params = params or {}\n", + " before = {\n", + " \"value\": snapshot_trainable_value(bundle),\n", + " \"train\": score_dataset(bundle, 
SMOKE_TRAIN_DATASET),\n", + " \"test\": score_dataset(bundle, SMOKE_TEST_DATASET),\n", + " }\n", + " result = _train_bundle(\n", + " bundle=bundle,\n", + " trainer_spec=TrainerConfig(id=trainer_id, params_variants=[params], logger=logger),\n", + " params=params,\n", + " mode=mode,\n", + " )\n", + " after = {\n", + " \"value\": snapshot_trainable_value(bundle),\n", + " \"train\": score_dataset(bundle, SMOKE_TRAIN_DATASET),\n", + " \"test\": score_dataset(bundle, SMOKE_TEST_DATASET),\n", + " }\n", + " return {\"trainer_id\": trainer_id, \"task\": task_label(bundle), \"mode\": mode, \"result\": result, \"before\": before, \"after\": after}\n", + "\n", + "def score_dataset(bundle: dict[str, Any], dataset: dict[str, list[Any]]) -> dict[str, Any]:\n", + " \"\"\"Evaluate a bundle on a dataset and retain per-example outputs.\"\"\"\n", + " inputs = dataset.get(\"inputs\") or []\n", + " infos = dataset.get(\"infos\") or dataset.get(\"info\") or []\n", + " if len(inputs) != len(infos):\n", + " raise ValueError(\"Dataset 'inputs' and 'infos' must have the same length.\")\n", + " if not inputs:\n", + " raise ValueError(\"Dataset must contain at least one example.\")\n", + "\n", + " rows = []\n", + " scores = []\n", + " for task_input, task_info in zip(inputs, infos):\n", + " response = output_value(bundle[\"param\"](task_input))\n", + " score, feedback = score_guide(bundle[\"guide\"], task_input, response, task_info)\n", + " scores.append(score)\n", + " rows.append({\n", + " \"input\": task_input,\n", + " \"expected\": task_info,\n", + " \"output\": response,\n", + " \"score\": score,\n", + " \"feedback\": feedback,\n", + " })\n", + " return {\"mean_score\": sum(scores) / len(scores), \"rows\": rows}\n" + ] + }, + { + "cell_type": "markdown", + "id": "33501896", + "metadata": {}, + "source": [ + "## 8. Real train/test smoke runs\n", + "\n", + "These runs use the real Trace-Bench trainer entry points and real installed trainer packages. 
They are intentionally tiny: small optimizer budgets, three training examples, and three held-out examples." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f62a8443", + "metadata": { + "execution": { + "iopub.execute_input": "2026-05-28T17:29:57.618788Z", + "iopub.status.busy": "2026-05-28T17:29:57.618723Z", + "iopub.status.idle": "2026-05-28T17:31:23.199295Z", + "shell.execute_reply": "2026-05-28T17:31:23.198801Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with only long-term memory.\n", + "Epoch: 0. Iteration: 0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + "Sampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00._run' was never awaited\n", + " with concurrent.futures.ThreadPoolExecutor() as executor:\n", + "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + "Evaluating agent: 0%| | 0/3 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trainer_idtaskmodestatusresolved_optimizerbefore_valueafter_valuetrain_examplestest_examplesbefore_train_scoreafter_train_scoretrain_deltabefore_test_scoreafter_test_scoretest_deltaerror
0PrioritySearchTrace scalarrealokOptoPrimeV20.03.033-3.00.03.0-3.00.03.0None
1TextGradTrainerTrace scalarrealokopto.optimizers.textgrad.TextGrad0.03.033-3.00.03.0-3.00.03.0None
2OpenEvolveTrainerTrace scalarrealokopenevolve.run_evolution0.00.033-3.0-3.00.0-3.0-3.00.0None
3DSPyTrainerDSPy scalarrealokdspy.COPROAlways answer 0.Provide a consistent response of \"0\" for any a...33-3.0-3.00.0-3.0-3.00.0None
\n", + "" + ], + "text/plain": [ + " trainer_id task mode status \\\n", + "0 PrioritySearch Trace scalar real ok \n", + "1 TextGradTrainer Trace scalar real ok \n", + "2 OpenEvolveTrainer Trace scalar real ok \n", + "3 DSPyTrainer DSPy scalar real ok \n", + "\n", + " resolved_optimizer before_value \\\n", + "0 OptoPrimeV2 0.0 \n", + "1 opto.optimizers.textgrad.TextGrad 0.0 \n", + "2 openevolve.run_evolution 0.0 \n", + "3 dspy.COPRO Always answer 0. \n", + "\n", + " after_value train_examples \\\n", + "0 3.0 3 \n", + "1 3.0 3 \n", + "2 0.0 3 \n", + "3 Provide a consistent response of \"0\" for any a... 3 \n", + "\n", + " test_examples before_train_score after_train_score train_delta \\\n", + "0 3 -3.0 0.0 3.0 \n", + "1 3 -3.0 0.0 3.0 \n", + "2 3 -3.0 -3.0 0.0 \n", + "3 3 -3.0 -3.0 0.0 \n", + "\n", + " before_test_score after_test_score test_delta error \n", + "0 -3.0 0.0 3.0 None \n", + "1 -3.0 0.0 3.0 None \n", + "2 -3.0 -3.0 0.0 None \n", + "3 -3.0 -3.0 0.0 None " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trainer_idtasksplitphaseexampleinputexpectedoutputscore
0DSPyTrainerDSPy scalartestafter0test-a3.00-3.0
1DSPyTrainerDSPy scalartestbefore0test-a3.00-3.0
2DSPyTrainerDSPy scalartestafter1test-b3.00-3.0
3DSPyTrainerDSPy scalartestbefore1test-b3.00-3.0
4DSPyTrainerDSPy scalartestafter2test-c3.00-3.0
5DSPyTrainerDSPy scalartestbefore2test-c3.00-3.0
6DSPyTrainerDSPy scalartrainafter0train-a3.00-3.0
7DSPyTrainerDSPy scalartrainbefore0train-a3.00-3.0
8DSPyTrainerDSPy scalartrainafter1train-b3.00-3.0
9DSPyTrainerDSPy scalartrainbefore1train-b3.00-3.0
10DSPyTrainerDSPy scalartrainafter2train-c3.00-3.0
11DSPyTrainerDSPy scalartrainbefore2train-c3.00-3.0
12OpenEvolveTrainerTrace scalartestafter0test-a3.00.0-3.0
13OpenEvolveTrainerTrace scalartestbefore0test-a3.00.0-3.0
14PrioritySearchTrace scalartestafter0test-a3.03.0-0.0
15PrioritySearchTrace scalartestbefore0test-a3.00.0-3.0
16TextGradTrainerTrace scalartestafter0test-a3.03.0-0.0
17TextGradTrainerTrace scalartestbefore0test-a3.00.0-3.0
18OpenEvolveTrainerTrace scalartestafter1test-b3.00.0-3.0
19OpenEvolveTrainerTrace scalartestbefore1test-b3.00.0-3.0
20PrioritySearchTrace scalartestafter1test-b3.03.0-0.0
21PrioritySearchTrace scalartestbefore1test-b3.00.0-3.0
22TextGradTrainerTrace scalartestafter1test-b3.03.0-0.0
23TextGradTrainerTrace scalartestbefore1test-b3.00.0-3.0
24OpenEvolveTrainerTrace scalartestafter2test-c3.00.0-3.0
25OpenEvolveTrainerTrace scalartestbefore2test-c3.00.0-3.0
26PrioritySearchTrace scalartestafter2test-c3.03.0-0.0
27PrioritySearchTrace scalartestbefore2test-c3.00.0-3.0
28TextGradTrainerTrace scalartestafter2test-c3.03.0-0.0
29TextGradTrainerTrace scalartestbefore2test-c3.00.0-3.0
30OpenEvolveTrainerTrace scalartrainafter0train-a3.00.0-3.0
31OpenEvolveTrainerTrace scalartrainbefore0train-a3.00.0-3.0
32PrioritySearchTrace scalartrainafter0train-a3.03.0-0.0
33PrioritySearchTrace scalartrainbefore0train-a3.00.0-3.0
34TextGradTrainerTrace scalartrainafter0train-a3.03.0-0.0
35TextGradTrainerTrace scalartrainbefore0train-a3.00.0-3.0
36OpenEvolveTrainerTrace scalartrainafter1train-b3.00.0-3.0
37OpenEvolveTrainerTrace scalartrainbefore1train-b3.00.0-3.0
38PrioritySearchTrace scalartrainafter1train-b3.03.0-0.0
39PrioritySearchTrace scalartrainbefore1train-b3.00.0-3.0
40TextGradTrainerTrace scalartrainafter1train-b3.03.0-0.0
41TextGradTrainerTrace scalartrainbefore1train-b3.00.0-3.0
42OpenEvolveTrainerTrace scalartrainafter2train-c3.00.0-3.0
43OpenEvolveTrainerTrace scalartrainbefore2train-c3.00.0-3.0
44PrioritySearchTrace scalartrainafter2train-c3.03.0-0.0
45PrioritySearchTrace scalartrainbefore2train-c3.00.0-3.0
46TextGradTrainerTrace scalartrainafter2train-c3.03.0-0.0
47TextGradTrainerTrace scalartrainbefore2train-c3.00.0-3.0
\n", + "
" + ], + "text/plain": [ + " trainer_id task split phase example input \\\n", + "0 DSPyTrainer DSPy scalar test after 0 test-a \n", + "1 DSPyTrainer DSPy scalar test before 0 test-a \n", + "2 DSPyTrainer DSPy scalar test after 1 test-b \n", + "3 DSPyTrainer DSPy scalar test before 1 test-b \n", + "4 DSPyTrainer DSPy scalar test after 2 test-c \n", + "5 DSPyTrainer DSPy scalar test before 2 test-c \n", + "6 DSPyTrainer DSPy scalar train after 0 train-a \n", + "7 DSPyTrainer DSPy scalar train before 0 train-a \n", + "8 DSPyTrainer DSPy scalar train after 1 train-b \n", + "9 DSPyTrainer DSPy scalar train before 1 train-b \n", + "10 DSPyTrainer DSPy scalar train after 2 train-c \n", + "11 DSPyTrainer DSPy scalar train before 2 train-c \n", + "12 OpenEvolveTrainer Trace scalar test after 0 test-a \n", + "13 OpenEvolveTrainer Trace scalar test before 0 test-a \n", + "14 PrioritySearch Trace scalar test after 0 test-a \n", + "15 PrioritySearch Trace scalar test before 0 test-a \n", + "16 TextGradTrainer Trace scalar test after 0 test-a \n", + "17 TextGradTrainer Trace scalar test before 0 test-a \n", + "18 OpenEvolveTrainer Trace scalar test after 1 test-b \n", + "19 OpenEvolveTrainer Trace scalar test before 1 test-b \n", + "20 PrioritySearch Trace scalar test after 1 test-b \n", + "21 PrioritySearch Trace scalar test before 1 test-b \n", + "22 TextGradTrainer Trace scalar test after 1 test-b \n", + "23 TextGradTrainer Trace scalar test before 1 test-b \n", + "24 OpenEvolveTrainer Trace scalar test after 2 test-c \n", + "25 OpenEvolveTrainer Trace scalar test before 2 test-c \n", + "26 PrioritySearch Trace scalar test after 2 test-c \n", + "27 PrioritySearch Trace scalar test before 2 test-c \n", + "28 TextGradTrainer Trace scalar test after 2 test-c \n", + "29 TextGradTrainer Trace scalar test before 2 test-c \n", + "30 OpenEvolveTrainer Trace scalar train after 0 train-a \n", + "31 OpenEvolveTrainer Trace scalar train before 0 train-a \n", + "32 PrioritySearch Trace 
scalar train after 0 train-a \n", + "33 PrioritySearch Trace scalar train before 0 train-a \n", + "34 TextGradTrainer Trace scalar train after 0 train-a \n", + "35 TextGradTrainer Trace scalar train before 0 train-a \n", + "36 OpenEvolveTrainer Trace scalar train after 1 train-b \n", + "37 OpenEvolveTrainer Trace scalar train before 1 train-b \n", + "38 PrioritySearch Trace scalar train after 1 train-b \n", + "39 PrioritySearch Trace scalar train before 1 train-b \n", + "40 TextGradTrainer Trace scalar train after 1 train-b \n", + "41 TextGradTrainer Trace scalar train before 1 train-b \n", + "42 OpenEvolveTrainer Trace scalar train after 2 train-c \n", + "43 OpenEvolveTrainer Trace scalar train before 2 train-c \n", + "44 PrioritySearch Trace scalar train after 2 train-c \n", + "45 PrioritySearch Trace scalar train before 2 train-c \n", + "46 TextGradTrainer Trace scalar train after 2 train-c \n", + "47 TextGradTrainer Trace scalar train before 2 train-c \n", + "\n", + " expected output score \n", + "0 3.0 0 -3.0 \n", + "1 3.0 0 -3.0 \n", + "2 3.0 0 -3.0 \n", + "3 3.0 0 -3.0 \n", + "4 3.0 0 -3.0 \n", + "5 3.0 0 -3.0 \n", + "6 3.0 0 -3.0 \n", + "7 3.0 0 -3.0 \n", + "8 3.0 0 -3.0 \n", + "9 3.0 0 -3.0 \n", + "10 3.0 0 -3.0 \n", + "11 3.0 0 -3.0 \n", + "12 3.0 0.0 -3.0 \n", + "13 3.0 0.0 -3.0 \n", + "14 3.0 3.0 -0.0 \n", + "15 3.0 0.0 -3.0 \n", + "16 3.0 3.0 -0.0 \n", + "17 3.0 0.0 -3.0 \n", + "18 3.0 0.0 -3.0 \n", + "19 3.0 0.0 -3.0 \n", + "20 3.0 3.0 -0.0 \n", + "21 3.0 0.0 -3.0 \n", + "22 3.0 3.0 -0.0 \n", + "23 3.0 0.0 -3.0 \n", + "24 3.0 0.0 -3.0 \n", + "25 3.0 0.0 -3.0 \n", + "26 3.0 3.0 -0.0 \n", + "27 3.0 0.0 -3.0 \n", + "28 3.0 3.0 -0.0 \n", + "29 3.0 0.0 -3.0 \n", + "30 3.0 0.0 -3.0 \n", + "31 3.0 0.0 -3.0 \n", + "32 3.0 3.0 -0.0 \n", + "33 3.0 0.0 -3.0 \n", + "34 3.0 3.0 -0.0 \n", + "35 3.0 0.0 -3.0 \n", + "36 3.0 0.0 -3.0 \n", + "37 3.0 0.0 -3.0 \n", + "38 3.0 3.0 -0.0 \n", + "39 3.0 0.0 -3.0 \n", + "40 3.0 3.0 -0.0 \n", + "41 3.0 0.0 -3.0 \n", + "42 3.0 
0.0 -3.0 \n", + "43 3.0 0.0 -3.0 \n", + "44 3.0 3.0 -0.0 \n", + "45 3.0 0.0 -3.0 \n", + "46 3.0 3.0 -0.0 \n", + "47 3.0 0.0 -3.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "summary_rows = []\n", + "example_rows = []\n", + "for item in smoke_results:\n", + " if \"result\" not in item:\n", + " summary_rows.append({\n", + " \"trainer_id\": item[\"trainer_id\"],\n", + " \"task\": item[\"task\"],\n", + " \"mode\": item[\"mode\"],\n", + " \"status\": item[\"status\"],\n", + " \"resolved_optimizer\": None,\n", + " \"before_value\": None,\n", + " \"after_value\": None,\n", + " \"train_examples\": len(SMOKE_TRAIN_DATASET[\"inputs\"]),\n", + " \"test_examples\": len(SMOKE_TEST_DATASET[\"inputs\"]),\n", + " \"before_train_score\": None,\n", + " \"after_train_score\": None,\n", + " \"train_delta\": None,\n", + " \"before_test_score\": None,\n", + " \"after_test_score\": None,\n", + " \"test_delta\": None,\n", + " \"error\": item[\"error\"],\n", + " })\n", + " continue\n", + " result_status = item[\"result\"].get(\"status\")\n", + " before_train = item[\"before\"][\"train\"][\"mean_score\"]\n", + " after_train = item[\"after\"][\"train\"][\"mean_score\"]\n", + " before_test = item[\"before\"][\"test\"][\"mean_score\"]\n", + " after_test = item[\"after\"][\"test\"][\"mean_score\"]\n", + " summary_rows.append({\n", + " \"trainer_id\": item[\"trainer_id\"],\n", + " \"task\": item[\"task\"],\n", + " \"mode\": item[\"mode\"],\n", + " \"status\": result_status,\n", + " \"resolved_optimizer\": item[\"result\"].get(\"resolved_optimizer\"),\n", + " \"before_value\": item[\"before\"][\"value\"],\n", + " \"after_value\": item[\"after\"][\"value\"],\n", + " \"train_examples\": len(SMOKE_TRAIN_DATASET[\"inputs\"]),\n", + " \"test_examples\": len(SMOKE_TEST_DATASET[\"inputs\"]),\n", + " \"before_train_score\": before_train,\n", + " \"after_train_score\": after_train,\n", + " \"train_delta\": after_train - before_train,\n", + " \"before_test_score\": 
before_test,\n", + " \"after_test_score\": after_test,\n", + " \"test_delta\": after_test - before_test,\n", + " \"error\": item[\"result\"].get(\"error\"),\n", + " })\n", + " for split_name in (\"train\", \"test\"):\n", + " for phase in (\"before\", \"after\"):\n", + " for index, row in enumerate(item[phase][split_name][\"rows\"]):\n", + " example_rows.append({\n", + " \"trainer_id\": item[\"trainer_id\"],\n", + " \"task\": item[\"task\"],\n", + " \"split\": split_name,\n", + " \"phase\": phase,\n", + " \"example\": index,\n", + " \"input\": row[\"input\"],\n", + " \"expected\": row[\"expected\"],\n", + " \"output\": row[\"output\"],\n", + " \"score\": row[\"score\"],\n", + " })\n", + "\n", + "trainer_comparison = pd.DataFrame(summary_rows)\n", + "example_comparison = pd.DataFrame(example_rows)\n", + "\n", + "display(trainer_comparison)\n", + "if example_rows:\n", + " display(example_comparison.sort_values([\"task\", \"split\", \"example\", \"trainer_id\", \"phase\"]).reset_index(drop=True))\n", + "else:\n", + " print(\"No per-example outputs were produced because all real trainer runs errored.\")" + ] + }, + { + "cell_type": "markdown", + "id": "72ec5773", + "metadata": {}, + "source": [ + "## 9. Practical reading guide\n", + "\n", + "When you inspect the results, read them in this order:\n", + "\n", + "1. **Focused tests** \n", + " If these fail, the branch is not ready to trust.\n", + "\n", + "2. **Discovery table** \n", + " If `TextGradTrainer`, `OpenEvolveTrainer`, or `DSPyTrainer` are missing, the branch or optional packages are not properly present or installed.\n", + "\n", + "3. **Real train/test smoke tables** \n", + " This confirms each trainer uses the real installed package path on three train examples and three held-out examples.\n", + "\n", + "4. **Error rows** \n", + " An error row means the real trainer path failed and should be inspected before trusting comparison scores." 
+ ] + }, + { + "cell_type": "markdown", + "id": "83656ee2", + "metadata": {}, + "source": [ + "## 10. What counts as success\n", + "\n", + "### Strong success\n", + "- focused tests pass\n", + "- discovery shows the comparison trainers\n", + "- real `opto.optimizers.textgrad.TextGrad`, `openevolve.run_evolution`, and `dspy.LM` import successfully\n", + "- real smoke rows for Trace, TextGrad, OpenEvolve, and DSPy complete without errors\n", + "\n", + "### Partial success\n", + "- focused tests pass\n", + "- structural checks pass\n", + "- one trainer reports an error row while the others complete, making the failure comparable\n", + "\n", + "### Failure\n", + "- trainers are not discovered\n", + "- focused tests fail\n", + "- DSPy is not backed by a real `dspy.LM`\n", + "- OpenEvolve path requires `exec` or unsafe parsing" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/test_dspy_trainer.py b/tests/test_dspy_trainer.py new file mode 100644 index 0000000..63cd74a --- /dev/null +++ b/tests/test_dspy_trainer.py @@ -0,0 +1,28 @@ +import itertools +from typing import Any + +import pytest + + +def test_dspy_trainer_restores_empty_global_lm(monkeypatch: pytest.MonkeyPatch) -> None: + dspy = pytest.importorskip("dspy") + trainer_module = pytest.importorskip("trace_bench.trainers.dspy_trainer") + from dspy.utils import DummyLM + + previous_lm = getattr(dspy.settings, "lm", None) + dspy.configure(lm=None) + trainer = trainer_module.DSPyTrainer(object()) + + def _train_inner(**_kwargs: Any) -> dict[str, str]: + return {"status": "ok"} + + monkeypatch.setattr(trainer, "_train_inner", 
_train_inner) + try: + trainer.train( + guide=object(), + train_dataset={"inputs": [], "infos": []}, + dspy_lm=DummyLM(itertools.cycle([{"answer": "ok"}])), + ) + assert getattr(dspy.settings, "lm", None) is None + finally: + dspy.configure(lm=previous_lm) diff --git a/tests/test_llm_utils.py b/tests/test_llm_utils.py new file mode 100644 index 0000000..1f2aaa0 --- /dev/null +++ b/tests/test_llm_utils.py @@ -0,0 +1,19 @@ +import pytest + +from trace_bench.llm import openai_compatible_model_name + + +def test_openai_compatible_model_name_strips_openrouter_prefix() -> None: + assert ( + openai_compatible_model_name("openrouter/openai/gpt-4o-mini") + == "openai/gpt-4o-mini" + ) + + +def test_openai_compatible_model_name_keeps_other_model_names() -> None: + assert openai_compatible_model_name("gpt-4o-mini") == "gpt-4o-mini" + + +def test_openai_compatible_model_name_requires_string() -> None: + with pytest.raises(TypeError, match="model must be a string"): + openai_compatible_model_name(None) # type: ignore[arg-type] diff --git a/tests/test_openevolve_trainer.py b/tests/test_openevolve_trainer.py index 1c4bfbd..22d26b9 100644 --- a/tests/test_openevolve_trainer.py +++ b/tests/test_openevolve_trainer.py @@ -1,5 +1,8 @@ +import asyncio import importlib +import os import sys +import tempfile import types import pytest @@ -31,15 +34,62 @@ def __call__(self, task_input: str, response: str, task_info: str): return (1.0 if response == task_info else 0.0), f"expected {task_info}" -def _import_openevolve_trainer(monkeypatch, best_code: str): +def _install_fake_openevolve_config(monkeypatch: pytest.MonkeyPatch) -> None: + fake_config_module = types.ModuleType("openevolve.config") + + class _FakeDatabaseConfig: + def __init__(self) -> None: + self.population_size = 1000 + self.num_islands = 5 + + class _FakeLLMConfig: + def __init__(self) -> None: + self.api_base = "https://api.openai.com/v1" + self.api_key = None + self.max_tokens = 4096 + self.temperature = 0.7 + self.timeout = 
60 + self.retries = 3 + self.retry_delay = 5 + self.models = [] + self.evaluator_models = [] + + class _FakeConfig: + def __init__(self, max_iterations: int, random_seed: int | None) -> None: + self.max_iterations = max_iterations + self.random_seed = random_seed + self.database = _FakeDatabaseConfig() + self.llm = _FakeLLMConfig() + + class _FakeLLMModelConfig: + def __init__(self, **kwargs: object) -> None: + self.kwargs = kwargs + + fake_config_module.Config = _FakeConfig + fake_config_module.LLMModelConfig = _FakeLLMModelConfig + monkeypatch.setitem(sys.modules, "openevolve.config", fake_config_module) + + +def _import_openevolve_trainer(monkeypatch: pytest.MonkeyPatch, best_code: str, capture: dict[str, object] | None = None) -> types.ModuleType: fake_module = types.ModuleType("openevolve") - def _run_evolution(*, initial_program, evaluator, iterations, **_kwargs): - del initial_program, evaluator, iterations + def _run_evolution(*, initial_program, evaluator, iterations, config=None, **_kwargs): + if capture is not None: + with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as handle: + handle.write(best_code) + candidate_path = handle.name + try: + capture["evaluation"] = evaluator(candidate_path) + finally: + os.unlink(candidate_path) + capture["config"] = config + capture["iterations"] = iterations + del initial_program return types.SimpleNamespace(best_code=best_code) fake_module.run_evolution = _run_evolution monkeypatch.setitem(sys.modules, "openevolve", fake_module) + _install_fake_openevolve_config(monkeypatch) sys.modules.pop("trace_bench.trainers.openevolve_trainer", None) return importlib.import_module("trace_bench.trainers.openevolve_trainer") @@ -62,6 +112,26 @@ def test_openevolve_trainer_updates_parameter(monkeypatch) -> None: assert trainer.param.greeting.data == "Hello" +def test_openevolve_trainer_runs_inside_active_event_loop(monkeypatch) -> None: + trainer_module = _import_openevolve_trainer( + monkeypatch, + 
best_code='candidate = {"greeting": "Hello"}\n', + ) + + async def _run_training() -> dict[str, object]: + trainer = trainer_module.OpenEvolveTrainer(_DummyAgent("Hi")) + return trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + iterations=1, + ensure_improvement=False, + ) + + result = asyncio.run(_run_training()) + assert result["status"] == "ok" + + def test_openevolve_trainer_rejects_worse_candidate(monkeypatch) -> None: trainer_module = _import_openevolve_trainer( monkeypatch, @@ -112,3 +182,28 @@ def parameters(self): mode="real", iterations=1, ) + + +def test_openevolve_trainer_returns_combined_score_and_configures_population(monkeypatch) -> None: + capture: dict = {} + trainer_module = _import_openevolve_trainer( + monkeypatch, + best_code='candidate = {"greeting": "Hello"}\n', + capture=capture, + ) + trainer = trainer_module.OpenEvolveTrainer(_DummyAgent("Hi")) + trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + iterations=2, + population_size=12, + num_islands=3, + model="openrouter/openai/gpt-4o-mini", + api_key="test-key", + ensure_improvement=False, + ) + assert capture["evaluation"]["combined_score"] == capture["evaluation"]["score"] + assert capture["config"].database.population_size == 12 + assert capture["config"].database.num_islands == 3 + assert capture["config"].llm.models[0].kwargs["name"] == "openai/gpt-4o-mini" diff --git a/trace_bench/trainers/dspy_trainer.py b/trace_bench/trainers/dspy_trainer.py index 665cb02..9c1f702 100644 --- a/trace_bench/trainers/dspy_trainer.py +++ b/trace_bench/trainers/dspy_trainer.py @@ -505,7 +505,7 @@ def train( verbose=verbose, ) finally: - if resolved_lm is not None and prev_lm is not None: + if resolved_lm is not None: _dspy.configure(lm=prev_lm) def _train_inner( diff --git a/trace_bench/trainers/openevolve_trainer.py b/trace_bench/trainers/openevolve_trainer.py 
index 43b211a..9171e40 100644 --- a/trace_bench/trainers/openevolve_trainer.py +++ b/trace_bench/trainers/openevolve_trainer.py @@ -1,7 +1,10 @@ from __future__ import annotations import ast +import asyncio import inspect +import os +from functools import partial from pathlib import Path from pprint import pformat from threading import RLock @@ -12,6 +15,7 @@ except Exception as exc: raise ImportError("OpenEvolveTrainer requires the optional 'openevolve' package.") from exc +from trace_bench.llm import openai_compatible_model_name from trace_bench.trainers._external_utils import apply_parameter_updates, collect_trainable_parameters, coerce_like, resolve_external_trainer_base, restore_parameter_values, score_model_on_dataset, snapshot_parameter_values, summarize_feedback _TrainerBase = resolve_external_trainer_base() @@ -81,6 +85,68 @@ def _filter_supported_kwargs(function: Any, kwargs: Dict[str, Any]) -> Dict[str, return dict(kwargs) return {key: value for key, value in kwargs.items() if key in signature.parameters} +def _run_evolution_compatible(kwargs: Dict[str, Any]) -> Any: + """Run OpenEvolve even when the caller already owns an asyncio event loop.""" + try: + asyncio.get_running_loop() + except RuntimeError: + return _run_evolution(**kwargs) + + try: + import nest_asyncio + except ImportError as exc: + raise RuntimeError( + "OpenEvolveTrainer requires nest_asyncio when called from an active asyncio event loop." 
+ ) from exc + nest_asyncio.apply() + return _run_evolution(**kwargs) + +def _build_openevolve_config(*, model: Optional[str], api_base: Optional[str], api_key: Optional[str], api_key_env: str, max_tokens: int, temperature: Optional[float], iterations: int, seed: Optional[int], population_size: Optional[int], num_islands: Optional[int]) -> Any: + """Build an OpenEvolve config for OpenAI-compatible providers when requested.""" + if api_key_env and not isinstance(api_key_env, str): + raise TypeError("api_key_env must be a string.") + if population_size is not None and population_size < 1: + raise ValueError("population_size must be at least 1.") + if num_islands is not None and num_islands < 1: + raise ValueError("num_islands must be at least 1.") + resolved_api_key = api_key or (os.environ.get(api_key_env) if api_key_env else None) + resolved_api_base = api_base or os.environ.get("OPENAI_BASE_URL") or os.environ.get("OPENAI_API_BASE") + resolved_model = model or os.environ.get("TRACE_LITELLM_MODEL") + if not any((resolved_api_key, resolved_api_base, resolved_model, population_size, num_islands)): + return None + if not resolved_model: + resolved_model = "gpt-4o-mini" + if max_tokens < 1: + raise ValueError("max_tokens must be at least 1.") + + from openevolve.config import Config, LLMModelConfig + + config = Config(max_iterations=iterations, random_seed=seed) + if population_size is not None: + config.database.population_size = population_size + if num_islands is not None: + config.database.num_islands = num_islands + if resolved_api_base: + config.llm.api_base = resolved_api_base + if resolved_api_key: + config.llm.api_key = resolved_api_key + config.llm.max_tokens = max_tokens + config.llm.temperature = temperature + model_config = LLMModelConfig( + name=openai_compatible_model_name(resolved_model), + api_base=config.llm.api_base, + api_key=config.llm.api_key, + temperature=temperature, + max_tokens=max_tokens, + timeout=config.llm.timeout, + 
retries=config.llm.retries, + retry_delay=config.llm.retry_delay, + random_seed=seed, + ) + config.llm.models = [model_config] + config.llm.evaluator_models = [model_config] + return config + class OpenEvolveTrainer(_TrainerBase): """Trace-Bench wrapper around OpenEvolve using safe literal parameter serialization.""" @@ -91,7 +157,7 @@ def __init__(self, agent: Any, optimizer: Any = None, logger: Any = None, **_kwa self.param = agent self.logger = logger - def train(self, guide: Any, train_dataset: Dict[str, Any], *, mode: str = "real", validate_dataset: Optional[Dict[str, Any]] = None, iterations: int = 10, population_size: Optional[int] = None, num_islands: Optional[int] = None, seed: Optional[int] = None, ensure_improvement: bool = True, improvement_threshold: float = 0.0, verbose: Union[bool, str] = False, **_kwargs: Any) -> Dict[str, Any]: + def train(self, guide: Any, train_dataset: Dict[str, Any], *, mode: str = "real", validate_dataset: Optional[Dict[str, Any]] = None, iterations: int = 10, population_size: Optional[int] = None, num_islands: Optional[int] = None, seed: Optional[int] = None, ensure_improvement: bool = True, improvement_threshold: float = 0.0, verbose: Union[bool, str] = False, model: Optional[str] = None, api_base: Optional[str] = None, api_key: Optional[str] = None, api_key_env: str = "OPENAI_API_KEY", max_tokens: int = 4096, temperature: Optional[float] = 0.7, output_dir: Optional[str] = None, cleanup: bool = True, **_kwargs: Any) -> Dict[str, Any]: """Optimize Trace parameters with OpenEvolve via a literal candidate mapping.""" if mode not in {"real", "stub"}: raise ValueError("mode must be either 'real' or 'stub'.") @@ -110,7 +176,7 @@ def evaluator(candidate_path: str) -> Dict[str, Any]: try: update_dict = _parse_candidate_program(program_text, parameters) except (TypeError, ValueError) as exc: - return {"score": float("-inf"), "feedback": str(exc)} + return {"score": float("-inf"), "combined_score": float("-inf"), "feedback": str(exc)} 
with _EVALUATION_LOCK: snapshot = snapshot_parameter_values(parameters) try: @@ -118,12 +184,15 @@ def evaluator(candidate_path: str) -> Dict[str, Any]: score, feedbacks = score_model_on_dataset(agent=self.param, guide=guide, dataset=evaluation_dataset, suppress_exceptions=True) finally: restore_parameter_values(snapshot) - return {"score": score, "feedback": summarize_feedback(feedbacks), "artifacts": {"candidate": {parameter.py_name: value for parameter, value in update_dict.items()}}} + return {"score": score, "combined_score": score, "feedback": summarize_feedback(feedbacks), "artifacts": {"candidate": {parameter.py_name: value for parameter, value in update_dict.items()}}} initial_program = _serialize_candidate_program(parameters) - run_kwargs = {"iterations": iterations, "population_size": population_size, "num_islands": num_islands, "seed": seed, "verbose": verbose if isinstance(verbose, bool) else False} + config = _build_openevolve_config(model=model, api_base=api_base, api_key=api_key, api_key_env=api_key_env, max_tokens=max_tokens, temperature=temperature, iterations=iterations, seed=seed, population_size=population_size, num_islands=num_islands) + run_kwargs = {"iterations": iterations, "population_size": population_size, "num_islands": num_islands, "seed": seed, "verbose": verbose if isinstance(verbose, bool) else False, "config": config, "output_dir": output_dir, "cleanup": cleanup} filtered_kwargs = _filter_supported_kwargs(_run_evolution, {key: value for key, value in run_kwargs.items() if value is not None}) - result = _run_evolution(initial_program=initial_program, evaluator=evaluator, **filtered_kwargs) + result = _run_evolution_compatible( + {"initial_program": initial_program, "evaluator": partial(evaluator), **filtered_kwargs} + ) best_code = _extract_best_code(result) best_update = _parse_candidate_program(best_code, parameters) From 59cda2b01ff97d0e23bbbec153a5a0d63cadc454 Mon Sep 17 00:00:00 2001 From: doxav <> Date: Wed, 3 Jun 2026 
00:30:30 +0200 Subject: [PATCH 3/8] refined notebook TextGrad OpenEvolve DSPy --- ...tgrad_openevolve_evaluation_notebook.ipynb | 2201 +++++------------ tests/test_textgrad_trainer.py | 22 +- trace_bench/trainers/textgrad_trainer.py | 4 +- 3 files changed, 644 insertions(+), 1583 deletions(-) diff --git a/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb b/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb index 87d0955..fb09a9c 100644 --- a/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb +++ b/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb @@ -9,13 +9,12 @@ "\n", "This notebook validates and compares four real trainer paths:\n", "\n", - "- `PrioritySearch` as the Trace baseline\n", + "- `PrioritySearch` as the standard Trace baseline\n", "- `TextGradTrainer`\n", "- `OpenEvolveTrainer`\n", "- `DSPyTrainer`\n", "\n", - "It checks out the `textgrad_openevolve` branch, installs the real optional packages when needed, runs focused structural checks, and then runs a tiny real train/test comparison with OpenRouter or OpenAI.\n", - "The notebook assumes the `textgrad_openevolve` branch contains the trainer integration under test." + "It checks out `textgrad_openevolve`, installs real optional packages when needed, runs focused integration checks, and runs a small real optimization demo with OpenRouter or OpenAI. The DSPy row uses a DSPy-native task; the Trace/TextGrad/OpenEvolve rows use a Trace scalar task." 
] }, { @@ -27,9 +26,9 @@ "\n", "- required trainer packages import from real installations\n", "- Trace-Bench discovers the trainer classes\n", - "- focused tests and compile checks pass\n", - "- every comparison row uses three train examples and three held-out examples\n", - "- result tables show trainer status, optimizer identity, before/after scores, and per-example outputs" + "- focused trainer tests and compile checks pass\n", + "- every comparison row learns from three examples and reports three held-out examples\n", + "- result tables show before/after scores, per-example outputs, and red highlighting for rows with no held-out improvement" ] }, { @@ -39,20 +38,11 @@ "source": [ "## High-level interpretation guide\n", "\n", - "Use this notebook in three layers:\n", + "Use the notebook in this order:\n", "\n", - "1. **Code-level correctness**\n", - " - Do the new trainers exist?\n", - " - Are they discovered?\n", - " - Do focused tests pass?\n", - "\n", - "2. **Behavior-level smoke checks**\n", - " - Do the trainer paths run against real installed packages?\n", - " - Do they produce comparable before/after rows?\n", - "\n", - "3. **Practical comparison**\n", - " - Which trainers improve on the tiny task?\n", - " - Which trainers complete but do not improve in this small budget?" + "1. Confirm the branch, package imports, trainer discovery, and focused tests.\n", + "2. Compare before/after train and held-out scores.\n", + "3. Treat any red row as a real trainer run that completed or failed without held-out improvement." 
] }, { @@ -61,10 +51,10 @@ "id": "56d885a1", "metadata": { "execution": { - "iopub.execute_input": "2026-05-28T17:29:44.264090Z", - "iopub.status.busy": "2026-05-28T17:29:44.263974Z", - "iopub.status.idle": "2026-05-28T17:29:44.268749Z", - "shell.execute_reply": "2026-05-28T17:29:44.268429Z" + "iopub.execute_input": "2026-06-02T16:40:11.307336Z", + "iopub.status.busy": "2026-06-02T16:40:11.307259Z", + "iopub.status.idle": "2026-06-02T16:40:11.311966Z", + "shell.execute_reply": "2026-06-02T16:40:11.311533Z" } }, "outputs": [ @@ -156,10 +146,10 @@ "id": "b6ca8593", "metadata": { "execution": { - "iopub.execute_input": "2026-05-28T17:29:44.270467Z", - "iopub.status.busy": "2026-05-28T17:29:44.270388Z", - "iopub.status.idle": "2026-05-28T17:29:46.996012Z", - "shell.execute_reply": "2026-05-28T17:29:46.995690Z" + "iopub.execute_input": "2026-06-02T16:40:11.313317Z", + "iopub.status.busy": "2026-06-02T16:40:11.313253Z", + "iopub.status.idle": "2026-06-02T16:40:13.710562Z", + "shell.execute_reply": "2026-06-02T16:40:13.710030Z" } }, "outputs": [ @@ -167,78 +157,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Trace-Bench already exists; checking out textgrad_openevolve.\n", - "$ git fetch https://github.com/doxav/Trace-Bench.git textgrad_openevolve\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "From https://github.com/doxav/Trace-Bench\n", - " * branch textgrad_openevolve -> FETCH_HEAD\n", - "Already on 'textgrad_openevolve'\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "$ git checkout textgrad_openevolve\n", - "M\ttests/test_openevolve_trainer.py\n", - "M\ttrace_bench/trainers/openevolve_trainer.py\n", - "Your branch is up to date with 'origin/textgrad_openevolve'.\n", - "$ git pull --ff-only https://github.com/doxav/Trace-Bench.git textgrad_openevolve\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "From https://github.com/doxav/Trace-Bench\n", - " * branch 
textgrad_openevolve -> FETCH_HEAD\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Already up to date.\n", - "NewTrace already exists; checking out experimental.\n", - "$ git fetch https://github.com/doxav/NewTrace.git experimental\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "From https://github.com/doxav/NewTrace\n", - " * branch experimental -> FETCH_HEAD\n", - "Already on 'experimental'\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "$ git checkout experimental\n", - "Your branch is up to date with 'origin/experimental'.\n", - "$ git pull --ff-only https://github.com/doxav/NewTrace.git experimental\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Already up to date.\n" + "Trace-Bench current checkout is already on textgrad_openevolve; preserving local edits.\n", + "$ git clone --branch experimental --single-branch https://github.com/doxav/NewTrace.git /home/xav/code/Trace-Bench/NewTrace\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "From https://github.com/doxav/NewTrace\n", - " * branch experimental -> FETCH_HEAD\n" + "Cloning into '/home/xav/code/Trace-Bench/NewTrace'...\n" ] } ], @@ -251,6 +178,12 @@ " TRACE_BENCH_REMOTE_URL,\n", " str(TRACE_BENCH_REPO),\n", " ])\n", + "elif TRACE_BENCH_REPO.resolve() == CURRENT_REPO.resolve():\n", + " branch = subprocess.check_output([\"git\", \"branch\", \"--show-current\"], cwd=TRACE_BENCH_REPO, text=True).strip()\n", + " if branch != TRACE_BENCH_BRANCH:\n", + " checkout_branch(TRACE_BENCH_REPO, TRACE_BENCH_REMOTE_URL, TRACE_BENCH_BRANCH)\n", + " else:\n", + " print(f\"Trace-Bench current checkout is already on {TRACE_BENCH_BRANCH}; preserving local edits.\")\n", "else:\n", " print(f\"Trace-Bench already exists; checking out {TRACE_BENCH_BRANCH}.\")\n", " checkout_branch(TRACE_BENCH_REPO, TRACE_BENCH_REMOTE_URL, TRACE_BENCH_BRANCH)\n", @@ -275,12 +208,7 @@ "source": [ "## 2. 
Install Python dependencies\n", "\n", - "This installs:\n", - "- `NewTrace` editable\n", - "- `Trace-Bench` editable\n", - "- light dependencies needed for the focused validation notebook\n", - "\n", - "If `openevolve.run_evolution` is not importable, this clones OpenEvolve from GitHub and installs it editable." + "This installs NewTrace and Trace-Bench editable, plus the real optional trainer packages used by the demo. If `openevolve.run_evolution` is not importable, OpenEvolve is cloned and installed editable." ] }, { @@ -289,10 +217,10 @@ "id": "fbae758b", "metadata": { "execution": { - "iopub.execute_input": "2026-05-28T17:29:46.997122Z", - "iopub.status.busy": "2026-05-28T17:29:46.997049Z", - "iopub.status.idle": "2026-05-28T17:29:51.676176Z", - "shell.execute_reply": "2026-05-28T17:29:51.675385Z" + "iopub.execute_input": "2026-06-02T16:40:13.711999Z", + "iopub.status.busy": "2026-06-02T16:40:13.711923Z", + "iopub.status.idle": "2026-06-02T16:40:20.589979Z", + "shell.execute_reply": "2026-06-02T16:40:20.589514Z" } }, "outputs": [ @@ -307,7 +235,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "$ /home/xav/miniconda3/bin/python -m pip install -q graphviz pyyaml pytest litellm aiohttp nest_asyncio dspy-ai tensorboard tensorboardX scikit-learn datasets openai pandas\n" + "$ /home/xav/miniconda3/bin/python -m pip install -q graphviz pyyaml pytest litellm aiohttp nest_asyncio dspy-ai optuna tensorboard tensorboardX scikit-learn datasets openai pandas\n" ] }, { @@ -328,7 +256,7 @@ "source": [ "run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-U\", \"pip\", \"setuptools\", \"wheel\"])\n", "run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\",\n", - " \"graphviz\", \"pyyaml\", \"pytest\", \"litellm\", \"aiohttp\", \"nest_asyncio\", \"dspy-ai\",\n", + " \"graphviz\", \"pyyaml\", \"pytest\", \"litellm\", \"aiohttp\", \"nest_asyncio\", \"dspy-ai\", \"optuna\",\n", " \"tensorboard\", \"tensorboardX\", \"scikit-learn\", \"datasets\", \"openai\", 
\"pandas\"])\n", "\n", "run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-e\", str(NEWTRACE_REPO)])\n", @@ -357,7 +285,7 @@ " run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-e\", str(OPENEVOLVE_REPO)])\n", "\n", "if not has_real_openevolve():\n", - " raise ImportError(\"OpenEvolve is required for this demo and could not be installed.\")\n" + " raise ImportError(\"OpenEvolve is required for this demo and could not be installed.\")" ] }, { @@ -367,11 +295,7 @@ "source": [ "## 3. Provider setup for real online experiments\n", "\n", - "The real smoke comparison requires this provider setup. Structural tests can still run before a provider is configured.\n", - "\n", - "Supported:\n", - "- `openrouter`\n", - "- `openai`" + "The comparison requires a real provider. In Colab the cell reads `OPENROUTER_API_KEY` or `OPENAI_API_KEY` from Colab Secrets when present; locally it reads the same environment variables." ] }, { @@ -380,10 +304,10 @@ "id": "0b984ab0", "metadata": { "execution": { - "iopub.execute_input": "2026-05-28T17:29:51.677483Z", - "iopub.status.busy": "2026-05-28T17:29:51.677372Z", - "iopub.status.idle": "2026-05-28T17:29:51.680910Z", - "shell.execute_reply": "2026-05-28T17:29:51.680574Z" + "iopub.execute_input": "2026-06-02T16:40:20.591586Z", + "iopub.status.busy": "2026-06-02T16:40:20.591465Z", + "iopub.status.idle": "2026-06-02T16:40:20.595509Z", + "shell.execute_reply": "2026-06-02T16:40:20.595070Z" } }, "outputs": [ @@ -470,10 +394,10 @@ "id": "3b4768bb", "metadata": { "execution": { - "iopub.execute_input": "2026-05-28T17:29:51.682018Z", - "iopub.status.busy": "2026-05-28T17:29:51.681957Z", - "iopub.status.idle": "2026-05-28T17:29:56.379387Z", - "shell.execute_reply": "2026-05-28T17:29:56.378979Z" + "iopub.execute_input": "2026-06-02T16:40:20.596914Z", + "iopub.status.busy": "2026-06-02T16:40:20.596844Z", + "iopub.status.idle": "2026-06-02T16:40:25.850826Z", + "shell.execute_reply": "2026-06-02T16:40:25.850296Z" } }, "outputs": [ 
@@ -555,10 +479,10 @@ "id": "af508c08", "metadata": { "execution": { - "iopub.execute_input": "2026-05-28T17:29:56.380992Z", - "iopub.status.busy": "2026-05-28T17:29:56.380906Z", - "iopub.status.idle": "2026-05-28T17:29:57.560432Z", - "shell.execute_reply": "2026-05-28T17:29:57.559955Z" + "iopub.execute_input": "2026-06-02T16:40:25.852675Z", + "iopub.status.busy": "2026-06-02T16:40:25.852596Z", + "iopub.status.idle": "2026-06-02T16:40:29.170929Z", + "shell.execute_reply": "2026-06-02T16:40:29.170457Z" } }, "outputs": [ @@ -566,22 +490,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "$ /home/xav/miniconda3/bin/python -m pytest tests/test_resolve_external_trainers.py tests/test_external_utils.py -q\n" + "$ /home/xav/miniconda3/bin/python -m pytest tests/test_resolve_external_trainers.py tests/test_external_utils.py tests/test_llm_utils.py tests/test_textgrad_trainer.py tests/test_openevolve_trainer.py tests/test_dspy_trainer.py -q\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "................" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ". [100%]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "17 passed in 2.54s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "... 
[100%]\n", - "3 passed in 0.93s\n", - "$ /home/xav/miniconda3/bin/python -m py_compile trace_bench/resolve.py trace_bench/cli.py trace_bench/runner.py trace_bench/trainers/_external_utils.py trace_bench/trainers/textgrad_trainer.py trace_bench/trainers/openevolve_trainer.py trace_bench/trainers/dspy_trainer.py\n" + "$ /home/xav/miniconda3/bin/python -m py_compile trace_bench/resolve.py trace_bench/cli.py trace_bench/runner.py trace_bench/llm.py trace_bench/trainers/_external_utils.py trace_bench/trainers/textgrad_trainer.py trace_bench/trainers/openevolve_trainer.py trace_bench/trainers/dspy_trainer.py\n" ] }, { "data": { "text/plain": [ - "CompletedProcess(args=['/home/xav/miniconda3/bin/python', '-m', 'py_compile', 'trace_bench/resolve.py', 'trace_bench/cli.py', 'trace_bench/runner.py', 'trace_bench/trainers/_external_utils.py', 'trace_bench/trainers/textgrad_trainer.py', 'trace_bench/trainers/openevolve_trainer.py', 'trace_bench/trainers/dspy_trainer.py'], returncode=0)" + "CompletedProcess(args=['/home/xav/miniconda3/bin/python', '-m', 'py_compile', 'trace_bench/resolve.py', 'trace_bench/cli.py', 'trace_bench/runner.py', 'trace_bench/llm.py', 'trace_bench/trainers/_external_utils.py', 'trace_bench/trainers/textgrad_trainer.py', 'trace_bench/trainers/openevolve_trainer.py', 'trace_bench/trainers/dspy_trainer.py'], returncode=0)" ] }, "execution_count": 6, @@ -593,6 +537,10 @@ "TARGETED_TESTS = [\n", " \"tests/test_resolve_external_trainers.py\",\n", " \"tests/test_external_utils.py\",\n", + " \"tests/test_llm_utils.py\",\n", + " \"tests/test_textgrad_trainer.py\",\n", + " \"tests/test_openevolve_trainer.py\",\n", + " \"tests/test_dspy_trainer.py\",\n", "]\n", "\n", "run([sys.executable, \"-m\", \"pytest\", *TARGETED_TESTS, \"-q\"], cwd=TRACE_BENCH_REPO)\n", @@ -600,6 +548,7 @@ " \"trace_bench/resolve.py\",\n", " \"trace_bench/cli.py\",\n", " \"trace_bench/runner.py\",\n", + " \"trace_bench/llm.py\",\n", " \"trace_bench/trainers/_external_utils.py\",\n", " 
\"trace_bench/trainers/textgrad_trainer.py\",\n", " \"trace_bench/trainers/openevolve_trainer.py\",\n", @@ -622,10 +571,10 @@ "id": "c182738c", "metadata": { "execution": { - "iopub.execute_input": "2026-05-28T17:29:57.561442Z", - "iopub.status.busy": "2026-05-28T17:29:57.561358Z", - "iopub.status.idle": "2026-05-28T17:29:57.605388Z", - "shell.execute_reply": "2026-05-28T17:29:57.604880Z" + "iopub.execute_input": "2026-06-02T16:40:29.172741Z", + "iopub.status.busy": "2026-06-02T16:40:29.172667Z", + "iopub.status.idle": "2026-06-02T16:40:29.246810Z", + "shell.execute_reply": "2026-06-02T16:40:29.246111Z" } }, "outputs": [ @@ -755,9 +704,9 @@ "id": "118fce8b", "metadata": {}, "source": [ - "## 7. Shared helpers for train/test smoke evaluation\n", + "## 7. Shared helpers for real train/test optimization\n", "\n", - "The Trace, TextGrad, and OpenEvolve rows reuse `trace_examples:opentrace_train_single_node`. The DSPy row uses a tiny real `dspy.Module` with the same scalar target. Every row learns from three examples and reports held-out performance on three more examples." + "The Trace, TextGrad, and OpenEvolve rows use the same Trace scalar parameter task. The DSPy row uses a small routing-code task where MIPROv2 can optimize instructions from labeled examples within notebook runtime." 
] }, { @@ -766,18 +715,21 @@ "id": "8eddf6c9", "metadata": { "execution": { - "iopub.execute_input": "2026-05-28T17:29:57.606741Z", - "iopub.status.busy": "2026-05-28T17:29:57.606666Z", - "iopub.status.idle": "2026-05-28T17:29:57.617692Z", - "shell.execute_reply": "2026-05-28T17:29:57.617113Z" + "iopub.execute_input": "2026-06-02T16:40:29.248580Z", + "iopub.status.busy": "2026-06-02T16:40:29.248504Z", + "iopub.status.idle": "2026-06-02T16:40:29.259545Z", + "shell.execute_reply": "2026-06-02T16:40:29.259148Z" } }, "outputs": [], "source": [ + "import contextlib\n", + "import io\n", "import re\n", - "from typing import Any\n", + "from typing import Any, Callable\n", "\n", "import dspy\n", + "from IPython.display import HTML, display\n", "\n", "from trace_bench.config import TrainerConfig\n", "from trace_bench.registry import load_task_bundle\n", @@ -786,108 +738,100 @@ "\n", "TRACE_TASK_ID = \"trace_examples:opentrace_train_single_node\"\n", "TASKS_ROOT = str(TRACE_BENCH_REPO / \"LLM4AD\" / \"benchmark_tasks\")\n", - "SMOKE_INITIAL_VALUE = 0.0\n", - "SMOKE_TARGET_VALUE = 3.0\n", - "SMOKE_TRAIN_DATASET = {\n", - " \"inputs\": [\"train-a\", \"train-b\", \"train-c\"],\n", - " \"infos\": [SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE],\n", + "TRACE_INITIAL_VALUE = 0.0\n", + "TRACE_TARGET_VALUE = 3.0\n", + "TRACE_TRAIN_DATASET = {\"inputs\": [\"train-a\", \"train-b\", \"train-c\"], \"infos\": [TRACE_TARGET_VALUE] * 3}\n", + "TRACE_TEST_DATASET = {\"inputs\": [\"test-a\", \"test-b\", \"test-c\"], \"infos\": [TRACE_TARGET_VALUE] * 3}\n", + "DSPY_TRAIN_DATASET = {\n", + " \"inputs\": [\"customer tier scarlet\", \"customer tier azure\", \"customer tier emerald\"],\n", + " \"infos\": [\"A\", \"B\", \"C\"],\n", "}\n", - "SMOKE_TEST_DATASET = {\n", - " \"inputs\": [\"test-a\", \"test-b\", \"test-c\"],\n", - " \"infos\": [SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE, SMOKE_TARGET_VALUE],\n", + "DSPY_TEST_DATASET = {\n", + " \"inputs\": [\"routing code for scarlet ticket\", 
\"routing code for azure ticket\", \"routing code for emerald ticket\"],\n", + " \"infos\": [\"A\", \"B\", \"C\"],\n", "}\n", "\n", - "class ScalarDSPySignature(dspy.Signature):\n", - " \"\"\"Always answer 0.\"\"\"\n", - " question: str = dspy.InputField()\n", - " answer: str = dspy.OutputField(desc=\"numeric scalar answer only\")\n", + "class RoutingDSPySignature(dspy.Signature):\n", + " \"\"\"Return the requested routing code as a single uppercase letter.\"\"\"\n", + " ticket: str = dspy.InputField()\n", + " answer: str = dspy.OutputField(desc=\"single uppercase letter\")\n", "\n", - "class ScalarDSPyAgent(dspy.Module):\n", - " \"\"\"Tiny DSPy module for the real DSPyTrainer smoke row.\"\"\"\n", + "class RoutingDSPyAgent(dspy.Module):\n", + " \"\"\"Small DSPy module optimized through DSPyTrainer.\"\"\"\n", " def __init__(self) -> None:\n", " super().__init__()\n", - " self.predict = dspy.Predict(ScalarDSPySignature)\n", + " self.predict = dspy.Predict(RoutingDSPySignature)\n", "\n", - " def forward(self, question: str) -> str:\n", - " return self.predict(question=question).answer\n", + " def forward(self, ticket: str) -> str:\n", + " return self.predict(ticket=ticket).answer\n", "\n", " @classmethod\n", " def to_examples(cls, inputs: list[Any], infos: list[Any]) -> list[Any]:\n", " return [\n", - " dspy.Example(question=str(task_input), answer=str(task_info), _task=task_input, _info=task_info).with_inputs(\"question\")\n", - " for task_input, task_info in zip(inputs, infos)\n", + " dspy.Example(ticket=str(ticket), answer=str(code), _task=ticket, _info=code).with_inputs(\"ticket\")\n", + " for ticket, code in zip(inputs, infos)\n", " ]\n", "\n", - "class ScalarDSPyGuide:\n", - " \"\"\"Score numeric DSPy answers against the scalar target.\"\"\"\n", + "class RoutingDSPyGuide:\n", + " \"\"\"Exact-match routing-code metric for DSPy optimizers.\"\"\"\n", " def get_feedback(self, _query: Any, response: Any, reference: Any, **_kwargs: Any) -> tuple[float, str]:\n", - " 
text = str(getattr(response, \"data\", response)).strip()\n", - " match = re.search(r\"-?\\d+(?:\\.\\d+)?\", text)\n", - " prediction = float(match.group(0)) if match else float(\"nan\")\n", - " target = float(reference)\n", - " score = -abs(prediction - target) if prediction == prediction else -10.0\n", - " return score, f\"target={target}; response={text}\"\n", + " text = str(getattr(response, \"data\", response)).strip().upper()\n", + " match = re.search(r\"\\b[A-Z]\\b\", text)\n", + " prediction = match.group(0) if match else text[:1]\n", + " target = str(reference).strip().upper()\n", + " score = 1.0 if prediction == target else 0.0\n", + " return score, f\"expected={target}; response={text}\"\n", "\n", " def __call__(self, query: Any, response: Any, reference: Any, **kwargs: Any) -> tuple[float, str]:\n", " return self.get_feedback(query, response, reference, **kwargs)\n", "\n", - "def make_trace_bundle() -> dict[str, Any]:\n", - " \"\"\"Load the existing Trace-Bench scalar example bundle.\"\"\"\n", - " return load_task_bundle(TRACE_TASK_ID, TASKS_ROOT)\n", - "\n", "def _set_only_scalar_trainable(bundle: dict[str, Any]) -> None:\n", - " \"\"\"Keep the smoke focused on the existing scalar parameter.\"\"\"\n", " param = bundle[\"param\"]\n", " scalar = getattr(param, \"value\", None)\n", " if scalar is None:\n", " scalar = getattr(param, \"guess\", None)\n", " if scalar is None:\n", - " raise AttributeError(\"Scalar smoke task requires param.value or param.guess.\")\n", + " raise AttributeError(\"Scalar demo task requires param.value or param.guess.\")\n", " for parameter in param.parameters():\n", " parameter.trainable = parameter is scalar\n", - " apply_parameter_updates({scalar: SMOKE_INITIAL_VALUE})\n", + " apply_parameter_updates({scalar: TRACE_INITIAL_VALUE})\n", "\n", - "def make_trace_smoke_bundle() -> dict[str, Any]:\n", - " \"\"\"Build a fresh train/test smoke bundle from the Trace scalar example.\"\"\"\n", - " bundle = make_trace_bundle()\n", + "def
make_trace_demo_bundle() -> dict[str, Any]:\n", + " bundle = load_task_bundle(TRACE_TASK_ID, TASKS_ROOT)\n", " _set_only_scalar_trainable(bundle)\n", - " bundle[\"train_dataset\"] = SMOKE_TRAIN_DATASET\n", - " bundle[\"test_dataset\"] = SMOKE_TEST_DATASET\n", + " bundle[\"train_dataset\"] = TRACE_TRAIN_DATASET\n", + " bundle[\"test_dataset\"] = TRACE_TEST_DATASET\n", " bundle.pop(\"validate_dataset\", None)\n", - " bundle[\"optimizer_kwargs\"][\"objective\"] = f\"Set the trainable scalar to exactly {SMOKE_TARGET_VALUE}.\"\n", + " bundle[\"optimizer_kwargs\"][\"objective\"] = f\"Set the trainable scalar to exactly {TRACE_TARGET_VALUE}.\"\n", " bundle[\"metadata\"][\"task_label\"] = \"Trace scalar\"\n", " return bundle\n", "\n", - "def make_dspy_lm() -> Any:\n", - " \"\"\"Build the real DSPy LM from the configured provider environment.\"\"\"\n", + "def make_dspy_lm(max_tokens: int = 200) -> Any:\n", " model = os.environ.get(\"TRACE_LITELLM_MODEL\") or \"gpt-4o-mini\"\n", " if \"/\" not in model and (\"gpt\" in model.lower() or model.lower().startswith(\"o\")):\n", " model = f\"openai/{model}\"\n", - " lm_kwargs: dict[str, Any] = {\"cache\": False}\n", + " lm_kwargs: dict[str, Any] = {\"cache\": False, \"max_tokens\": max_tokens}\n", " api_base = os.environ.get(\"OPENAI_BASE_URL\") or os.environ.get(\"OPENAI_API_BASE\")\n", " if api_base:\n", " lm_kwargs[\"api_base\"] = api_base\n", " return dspy.LM(model=model, **lm_kwargs)\n", "\n", - "def make_dspy_smoke_bundle() -> dict[str, Any]:\n", - " \"\"\"Build a fresh train/test smoke bundle for the DSPy trainer row.\"\"\"\n", + "def make_dspy_demo_bundle() -> dict[str, Any]:\n", " dspy.configure(lm=make_dspy_lm())\n", " return {\n", - " \"param\": ScalarDSPyAgent(),\n", - " \"guide\": ScalarDSPyGuide(),\n", - " \"train_dataset\": SMOKE_TRAIN_DATASET,\n", - " \"test_dataset\": SMOKE_TEST_DATASET,\n", - " \"optimizer_kwargs\": {\"objective\": f\"Answer every scalar benchmark item with exactly {SMOKE_TARGET_VALUE}.\"},\n", - 
" \"metadata\": {\"task_label\": \"DSPy scalar\", \"framework\": \"dspy\"},\n", + " \"param\": RoutingDSPyAgent(),\n", + " \"guide\": RoutingDSPyGuide(),\n", + " \"train_dataset\": DSPY_TRAIN_DATASET,\n", + " \"test_dataset\": DSPY_TEST_DATASET,\n", + " \"optimizer_kwargs\": {},\n", + " \"metadata\": {\"task_label\": \"DSPy routing code\", \"framework\": \"dspy\"},\n", " }\n", "\n", - "def short_text(value: Any, limit: int = 80) -> str:\n", - " \"\"\"Return a compact display value for comparison tables.\"\"\"\n", + "def short_text(value: Any, limit: int = 100) -> str:\n", " text = str(value)\n", " return text if len(text) <= limit else text[: limit - 3] + \"...\"\n", "\n", "def snapshot_trainable_value(bundle: dict[str, Any]) -> Any:\n", - " \"\"\"Return the current scalar value or DSPy instruction.\"\"\"\n", " scalar = getattr(bundle[\"param\"], \"value\", None)\n", " if scalar is None:\n", " scalar = getattr(bundle[\"param\"], \"guess\", None)\n", @@ -898,35 +842,47 @@ " return short_text(getattr(signature, \"instructions\", type(bundle[\"param\"]).__name__))\n", "\n", "def task_label(bundle: dict[str, Any]) -> str:\n", - " \"\"\"Return the display label for a smoke bundle.\"\"\"\n", - " return str(bundle.get(\"metadata\", {}).get(\"task_label\") or bundle.get(\"metadata\", {}).get(\"benchmark\") or \"smoke\")\n", + " metadata = bundle.get(\"metadata\", {})\n", + " return str(metadata.get(\"task_label\") or metadata.get(\"benchmark\") or \"demo\")\n", "\n", "def output_value(output: Any) -> Any:\n", - " \"\"\"Return a compact scalar/string output value.\"\"\"\n", - " return short_text(getattr(output, \"data\", output), limit=120)\n", + " return short_text(getattr(output, \"data\", output), limit=140)\n", "\n", "def score_guide(guide: Any, task_input: Any, response: Any, task_info: Any) -> tuple[float, str]:\n", - " \"\"\"Score with Trace Guide or DSPy-style get_feedback guide.\"\"\"\n", - " if callable(guide):\n", - " score, feedback = guide(task_input, response, 
task_info)\n", - " else:\n", - " score, feedback = guide.get_feedback(task_input, response, task_info)\n", + " score, feedback = guide(task_input, response, task_info) if callable(guide) else guide.get_feedback(task_input, response, task_info)\n", " return float(score), str(feedback)\n", "\n", + "def score_dataset(bundle: dict[str, Any], dataset: dict[str, list[Any]]) -> dict[str, Any]:\n", + " inputs = dataset.get(\"inputs\") or []\n", + " infos = dataset.get(\"infos\") or dataset.get(\"info\") or []\n", + " if len(inputs) != len(infos):\n", + " raise ValueError(\"Dataset 'inputs' and 'infos' must have the same length.\")\n", + " if not inputs:\n", + " raise ValueError(\"Dataset must contain at least one example.\")\n", + " rows = []\n", + " scores = []\n", + " for task_input, task_info in zip(inputs, infos):\n", + " response = output_value(bundle[\"param\"](task_input))\n", + " score, feedback = score_guide(bundle[\"guide\"], task_input, response, task_info)\n", + " scores.append(score)\n", + " rows.append({\"input\": task_input, \"expected\": task_info, \"output\": response, \"score\": score, \"feedback\": feedback})\n", + " return {\"mean_score\": sum(scores) / len(scores), \"rows\": rows}\n", + "\n", "def run_train_bundle(\n", " trainer_id: str,\n", " params: dict[str, Any] | None = None,\n", " mode: str = \"real\",\n", " logger: str = \"none\",\n", - " bundle_factory: Any = make_trace_smoke_bundle,\n", + " bundle_factory: Callable[[], dict[str, Any]] = make_trace_demo_bundle,\n", ") -> dict[str, Any]:\n", - " \"\"\"Run one trainer on the 3-example train split and score the 3-example test split.\"\"\"\n", " bundle = bundle_factory()\n", " params = params or {}\n", + " train_dataset = bundle[\"train_dataset\"]\n", + " test_dataset = bundle.get(\"test_dataset\") or train_dataset\n", " before = {\n", " \"value\": snapshot_trainable_value(bundle),\n", - " \"train\": score_dataset(bundle, SMOKE_TRAIN_DATASET),\n", - " \"test\": score_dataset(bundle, 
SMOKE_TEST_DATASET),\n", + " \"train\": score_dataset(bundle, train_dataset),\n", + " \"test\": score_dataset(bundle, test_dataset),\n", " }\n", " result = _train_bundle(\n", " bundle=bundle,\n", @@ -936,34 +892,19 @@ " )\n", " after = {\n", " \"value\": snapshot_trainable_value(bundle),\n", - " \"train\": score_dataset(bundle, SMOKE_TRAIN_DATASET),\n", - " \"test\": score_dataset(bundle, SMOKE_TEST_DATASET),\n", + " \"train\": score_dataset(bundle, train_dataset),\n", + " \"test\": score_dataset(bundle, test_dataset),\n", " }\n", - " return {\"trainer_id\": trainer_id, \"task\": task_label(bundle), \"mode\": mode, \"result\": result, \"before\": before, \"after\": after}\n", - "\n", - "def score_dataset(bundle: dict[str, Any], dataset: dict[str, list[Any]]) -> dict[str, Any]:\n", - " \"\"\"Evaluate a bundle on a dataset and retain per-example outputs.\"\"\"\n", - " inputs = dataset.get(\"inputs\") or []\n", - " infos = dataset.get(\"infos\") or dataset.get(\"info\") or []\n", - " if len(inputs) != len(infos):\n", - " raise ValueError(\"Dataset 'inputs' and 'infos' must have the same length.\")\n", - " if not inputs:\n", - " raise ValueError(\"Dataset must contain at least one example.\")\n", - "\n", - " rows = []\n", - " scores = []\n", - " for task_input, task_info in zip(inputs, infos):\n", - " response = output_value(bundle[\"param\"](task_input))\n", - " score, feedback = score_guide(bundle[\"guide\"], task_input, response, task_info)\n", - " scores.append(score)\n", - " rows.append({\n", - " \"input\": task_input,\n", - " \"expected\": task_info,\n", - " \"output\": response,\n", - " \"score\": score,\n", - " \"feedback\": feedback,\n", - " })\n", - " return {\"mean_score\": sum(scores) / len(scores), \"rows\": rows}\n" + " return {\n", + " \"trainer_id\": trainer_id,\n", + " \"task\": task_label(bundle),\n", + " \"mode\": mode,\n", + " \"result\": result,\n", + " \"before\": before,\n", + " \"after\": after,\n", + " \"train_examples\": 
len(train_dataset[\"inputs\"]),\n", + " \"test_examples\": len(test_dataset[\"inputs\"]),\n", + " }" ] }, { @@ -971,9 +912,9 @@ "id": "33501896", "metadata": {}, "source": [ - "## 8. Real train/test smoke runs\n", + "## 8. Real train/test optimization runs\n", "\n", - "These runs use the real Trace-Bench trainer entry points and real installed trainer packages. They are intentionally tiny: small optimizer budgets, three training examples, and three held-out examples." + "These runs use the real Trace-Bench trainer entry points and real installed trainer packages. Optimizer logs are captured so the notebook output stays focused on the comparison tables." ] }, { @@ -982,10 +923,10 @@ "id": "f62a8443", "metadata": { "execution": { - "iopub.execute_input": "2026-05-28T17:29:57.618788Z", - "iopub.status.busy": "2026-05-28T17:29:57.618723Z", - "iopub.status.idle": "2026-05-28T17:31:23.199295Z", - "shell.execute_reply": "2026-05-28T17:31:23.198801Z" + "iopub.execute_input": "2026-06-02T16:40:29.261316Z", + "iopub.status.busy": "2026-06-02T16:40:29.261235Z", + "iopub.status.idle": "2026-06-02T16:44:58.092383Z", + "shell.execute_reply": "2026-06-02T16:44:58.091779Z" } }, "outputs": [ @@ -993,1166 +934,244 @@ "name": "stdout", "output_type": "stream", "text": [ - "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", - "PrioritySearch initialized with only long-term memory.\n", - "Epoch: 0. 
Iteration: 0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\r", - "Sampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00._run' was never awaited\n", - " with concurrent.futures.ThreadPoolExecutor() as executor:\n", - "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\r", - "Evaluating agent: 0%| | 0/3 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 trainer_idtaskmodestatusresolved_optimizerbefore_valueafter_valuetrain_examplestest_examplesbefore_train_scoreafter_train_scoretrain_deltabefore_test_scoreafter_test_scoretest_deltaimprovementerror
0PrioritySearchTrace scalarrealokOptoPrimeV20.0000003.00000033-3.0000.0003.000-3.0000.0003.000YESNone
1TextGradTrainerTrace scalarrealokopto.optimizers.textgrad.TextGrad0.0000003.00000033-3.0000.0003.000-3.0000.0003.000YESNone
2OpenEvolveTrainerTrace scalarrealokopenevolve.run_evolution0.0000001.00000033-3.000-2.0001.000-3.000-2.0001.000YESNone
3DSPyTrainerDSPy routing coderealokdspy.MIPROv2Return the requested routing code as a single uppercase letter.Based on the customer tier mentioned in the ticket (e.g., \"customer tier scarlet\", \"customer tier...330.0001.0001.0000.0001.0001.000YESNone
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\r", - "Validating newly proposed candidates: Sampling 4 agents on 1 inputs: 0%| | 0/4 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
trainer_idtaskmodestatusresolved_optimizerbefore_valueafter_valuetrain_examplestest_examplesbefore_train_scoreafter_train_scoretrain_deltabefore_test_scoreafter_test_scoretest_deltaerror
0PrioritySearchTrace scalarrealokOptoPrimeV20.03.033-3.00.03.0-3.00.03.0None
1TextGradTrainerTrace scalarrealokopto.optimizers.textgrad.TextGrad0.03.033-3.00.03.0-3.00.03.0None
2OpenEvolveTrainerTrace scalarrealokopenevolve.run_evolution0.00.033-3.0-3.00.0-3.0-3.00.0None
3DSPyTrainerDSPy scalarrealokdspy.COPROAlways answer 0.Provide a consistent response of \"0\" for any a...33-3.0-3.00.0-3.0-3.00.0None
\n", - "" - ], - "text/plain": [ - " trainer_id task mode status \\\n", - "0 PrioritySearch Trace scalar real ok \n", - "1 TextGradTrainer Trace scalar real ok \n", - "2 OpenEvolveTrainer Trace scalar real ok \n", - "3 DSPyTrainer DSPy scalar real ok \n", - "\n", - " resolved_optimizer before_value \\\n", - "0 OptoPrimeV2 0.0 \n", - "1 opto.optimizers.textgrad.TextGrad 0.0 \n", - "2 openevolve.run_evolution 0.0 \n", - "3 dspy.COPRO Always answer 0. \n", - "\n", - " after_value train_examples \\\n", - "0 3.0 3 \n", - "1 3.0 3 \n", - "2 0.0 3 \n", - "3 Provide a consistent response of \"0\" for any a... 3 \n", - "\n", - " test_examples before_train_score after_train_score train_delta \\\n", - "0 3 -3.0 0.0 3.0 \n", - "1 3 -3.0 0.0 3.0 \n", - "2 3 -3.0 -3.0 0.0 \n", - "3 3 -3.0 -3.0 0.0 \n", - "\n", - " before_test_score after_test_score test_delta error \n", - "0 -3.0 0.0 3.0 None \n", - "1 -3.0 0.0 3.0 None \n", - "2 -3.0 -3.0 0.0 None \n", - "3 -3.0 -3.0 0.0 None " - ] - }, - "metadata": {}, - "output_type": "display_data" + "data": { + "text/html": [ + "
All trainers improved on held-out examples.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { "data": { @@ -2190,146 +1209,146 @@ " \n", " 0\n", " DSPyTrainer\n", - " DSPy scalar\n", + " DSPy routing code\n", " test\n", " after\n", " 0\n", - " test-a\n", - " 3.0\n", - " 0\n", - " -3.0\n", + " routing code for scarlet ticket\n", + " A\n", + " A\n", + " 1.0\n", " \n", " \n", " 1\n", " DSPyTrainer\n", - " DSPy scalar\n", + " DSPy routing code\n", " test\n", " before\n", " 0\n", - " test-a\n", - " 3.0\n", - " 0\n", - " -3.0\n", + " routing code for scarlet ticket\n", + " A\n", + " S\n", + " 0.0\n", " \n", " \n", " 2\n", " DSPyTrainer\n", - " DSPy scalar\n", + " DSPy routing code\n", " test\n", " after\n", " 1\n", - " test-b\n", - " 3.0\n", - " 0\n", - " -3.0\n", + " routing code for azure ticket\n", + " B\n", + " B\n", + " 1.0\n", " \n", " \n", " 3\n", " DSPyTrainer\n", - " DSPy scalar\n", + " DSPy routing code\n", " test\n", " before\n", " 1\n", - " test-b\n", - " 3.0\n", - " 0\n", - " -3.0\n", + " routing code for azure ticket\n", + " B\n", + " A\n", + " 0.0\n", " \n", " \n", " 4\n", " DSPyTrainer\n", - " DSPy scalar\n", + " DSPy routing code\n", " test\n", " after\n", " 2\n", - " test-c\n", - " 3.0\n", - " 0\n", - " -3.0\n", + " routing code for emerald ticket\n", + " C\n", + " C\n", + " 1.0\n", " \n", " \n", " 5\n", " DSPyTrainer\n", - " DSPy scalar\n", + " DSPy routing code\n", " test\n", " before\n", " 2\n", - " test-c\n", - " 3.0\n", - " 0\n", - " -3.0\n", + " routing code for emerald ticket\n", + " C\n", + " E\n", + " 0.0\n", " \n", " \n", " 6\n", " DSPyTrainer\n", - " DSPy scalar\n", + " DSPy routing code\n", " train\n", " after\n", " 0\n", - " train-a\n", - " 3.0\n", - " 0\n", - " -3.0\n", + " customer tier scarlet\n", + " A\n", + " A\n", + " 1.0\n", " \n", " \n", " 7\n", " DSPyTrainer\n", - " DSPy scalar\n", + " DSPy routing code\n", " train\n", " before\n", " 0\n", - " train-a\n", - " 3.0\n", - " 0\n", - " -3.0\n", + " customer tier scarlet\n", + " A\n", + 
" S\n", + " 0.0\n", " \n", " \n", " 8\n", " DSPyTrainer\n", - " DSPy scalar\n", + " DSPy routing code\n", " train\n", " after\n", " 1\n", - " train-b\n", - " 3.0\n", - " 0\n", - " -3.0\n", + " customer tier azure\n", + " B\n", + " B\n", + " 1.0\n", " \n", " \n", " 9\n", " DSPyTrainer\n", - " DSPy scalar\n", + " DSPy routing code\n", " train\n", " before\n", " 1\n", - " train-b\n", - " 3.0\n", - " 0\n", - " -3.0\n", + " customer tier azure\n", + " B\n", + " A\n", + " 0.0\n", " \n", " \n", " 10\n", " DSPyTrainer\n", - " DSPy scalar\n", + " DSPy routing code\n", " train\n", " after\n", " 2\n", - " train-c\n", - " 3.0\n", - " 0\n", - " -3.0\n", + " customer tier emerald\n", + " C\n", + " C\n", + " 1.0\n", " \n", " \n", " 11\n", " DSPyTrainer\n", - " DSPy scalar\n", + " DSPy routing code\n", " train\n", " before\n", " 2\n", - " train-c\n", - " 3.0\n", - " 0\n", - " -3.0\n", + " customer tier emerald\n", + " C\n", + " E\n", + " 0.0\n", " \n", " \n", " 12\n", @@ -2340,8 +1359,8 @@ " 0\n", " test-a\n", " 3.0\n", - " 0.0\n", - " -3.0\n", + " 1.0\n", + " -2.0\n", " \n", " \n", " 13\n", @@ -2412,8 +1431,8 @@ " 1\n", " test-b\n", " 3.0\n", - " 0.0\n", - " -3.0\n", + " 1.0\n", + " -2.0\n", " \n", " \n", " 19\n", @@ -2484,8 +1503,8 @@ " 2\n", " test-c\n", " 3.0\n", - " 0.0\n", - " -3.0\n", + " 1.0\n", + " -2.0\n", " \n", " \n", " 25\n", @@ -2556,8 +1575,8 @@ " 0\n", " train-a\n", " 3.0\n", - " 0.0\n", - " -3.0\n", + " 1.0\n", + " -2.0\n", " \n", " \n", " 31\n", @@ -2628,8 +1647,8 @@ " 1\n", " train-b\n", " 3.0\n", - " 0.0\n", - " -3.0\n", + " 1.0\n", + " -2.0\n", " \n", " \n", " 37\n", @@ -2700,8 +1719,8 @@ " 2\n", " train-c\n", " 3.0\n", - " 0.0\n", - " -3.0\n", + " 1.0\n", + " -2.0\n", " \n", " \n", " 43\n", @@ -2768,109 +1787,119 @@ "" ], "text/plain": [ - " trainer_id task split phase example input \\\n", - "0 DSPyTrainer DSPy scalar test after 0 test-a \n", - "1 DSPyTrainer DSPy scalar test before 0 test-a \n", - "2 DSPyTrainer DSPy scalar test after 1 test-b \n", - "3 
DSPyTrainer DSPy scalar test before 1 test-b \n", - "4 DSPyTrainer DSPy scalar test after 2 test-c \n", - "5 DSPyTrainer DSPy scalar test before 2 test-c \n", - "6 DSPyTrainer DSPy scalar train after 0 train-a \n", - "7 DSPyTrainer DSPy scalar train before 0 train-a \n", - "8 DSPyTrainer DSPy scalar train after 1 train-b \n", - "9 DSPyTrainer DSPy scalar train before 1 train-b \n", - "10 DSPyTrainer DSPy scalar train after 2 train-c \n", - "11 DSPyTrainer DSPy scalar train before 2 train-c \n", - "12 OpenEvolveTrainer Trace scalar test after 0 test-a \n", - "13 OpenEvolveTrainer Trace scalar test before 0 test-a \n", - "14 PrioritySearch Trace scalar test after 0 test-a \n", - "15 PrioritySearch Trace scalar test before 0 test-a \n", - "16 TextGradTrainer Trace scalar test after 0 test-a \n", - "17 TextGradTrainer Trace scalar test before 0 test-a \n", - "18 OpenEvolveTrainer Trace scalar test after 1 test-b \n", - "19 OpenEvolveTrainer Trace scalar test before 1 test-b \n", - "20 PrioritySearch Trace scalar test after 1 test-b \n", - "21 PrioritySearch Trace scalar test before 1 test-b \n", - "22 TextGradTrainer Trace scalar test after 1 test-b \n", - "23 TextGradTrainer Trace scalar test before 1 test-b \n", - "24 OpenEvolveTrainer Trace scalar test after 2 test-c \n", - "25 OpenEvolveTrainer Trace scalar test before 2 test-c \n", - "26 PrioritySearch Trace scalar test after 2 test-c \n", - "27 PrioritySearch Trace scalar test before 2 test-c \n", - "28 TextGradTrainer Trace scalar test after 2 test-c \n", - "29 TextGradTrainer Trace scalar test before 2 test-c \n", - "30 OpenEvolveTrainer Trace scalar train after 0 train-a \n", - "31 OpenEvolveTrainer Trace scalar train before 0 train-a \n", - "32 PrioritySearch Trace scalar train after 0 train-a \n", - "33 PrioritySearch Trace scalar train before 0 train-a \n", - "34 TextGradTrainer Trace scalar train after 0 train-a \n", - "35 TextGradTrainer Trace scalar train before 0 train-a \n", - "36 OpenEvolveTrainer 
Trace scalar train after 1 train-b \n", - "37 OpenEvolveTrainer Trace scalar train before 1 train-b \n", - "38 PrioritySearch Trace scalar train after 1 train-b \n", - "39 PrioritySearch Trace scalar train before 1 train-b \n", - "40 TextGradTrainer Trace scalar train after 1 train-b \n", - "41 TextGradTrainer Trace scalar train before 1 train-b \n", - "42 OpenEvolveTrainer Trace scalar train after 2 train-c \n", - "43 OpenEvolveTrainer Trace scalar train before 2 train-c \n", - "44 PrioritySearch Trace scalar train after 2 train-c \n", - "45 PrioritySearch Trace scalar train before 2 train-c \n", - "46 TextGradTrainer Trace scalar train after 2 train-c \n", - "47 TextGradTrainer Trace scalar train before 2 train-c \n", + " trainer_id task split phase example \\\n", + "0 DSPyTrainer DSPy routing code test after 0 \n", + "1 DSPyTrainer DSPy routing code test before 0 \n", + "2 DSPyTrainer DSPy routing code test after 1 \n", + "3 DSPyTrainer DSPy routing code test before 1 \n", + "4 DSPyTrainer DSPy routing code test after 2 \n", + "5 DSPyTrainer DSPy routing code test before 2 \n", + "6 DSPyTrainer DSPy routing code train after 0 \n", + "7 DSPyTrainer DSPy routing code train before 0 \n", + "8 DSPyTrainer DSPy routing code train after 1 \n", + "9 DSPyTrainer DSPy routing code train before 1 \n", + "10 DSPyTrainer DSPy routing code train after 2 \n", + "11 DSPyTrainer DSPy routing code train before 2 \n", + "12 OpenEvolveTrainer Trace scalar test after 0 \n", + "13 OpenEvolveTrainer Trace scalar test before 0 \n", + "14 PrioritySearch Trace scalar test after 0 \n", + "15 PrioritySearch Trace scalar test before 0 \n", + "16 TextGradTrainer Trace scalar test after 0 \n", + "17 TextGradTrainer Trace scalar test before 0 \n", + "18 OpenEvolveTrainer Trace scalar test after 1 \n", + "19 OpenEvolveTrainer Trace scalar test before 1 \n", + "20 PrioritySearch Trace scalar test after 1 \n", + "21 PrioritySearch Trace scalar test before 1 \n", + "22 TextGradTrainer Trace 
scalar test after 1 \n", + "23 TextGradTrainer Trace scalar test before 1 \n", + "24 OpenEvolveTrainer Trace scalar test after 2 \n", + "25 OpenEvolveTrainer Trace scalar test before 2 \n", + "26 PrioritySearch Trace scalar test after 2 \n", + "27 PrioritySearch Trace scalar test before 2 \n", + "28 TextGradTrainer Trace scalar test after 2 \n", + "29 TextGradTrainer Trace scalar test before 2 \n", + "30 OpenEvolveTrainer Trace scalar train after 0 \n", + "31 OpenEvolveTrainer Trace scalar train before 0 \n", + "32 PrioritySearch Trace scalar train after 0 \n", + "33 PrioritySearch Trace scalar train before 0 \n", + "34 TextGradTrainer Trace scalar train after 0 \n", + "35 TextGradTrainer Trace scalar train before 0 \n", + "36 OpenEvolveTrainer Trace scalar train after 1 \n", + "37 OpenEvolveTrainer Trace scalar train before 1 \n", + "38 PrioritySearch Trace scalar train after 1 \n", + "39 PrioritySearch Trace scalar train before 1 \n", + "40 TextGradTrainer Trace scalar train after 1 \n", + "41 TextGradTrainer Trace scalar train before 1 \n", + "42 OpenEvolveTrainer Trace scalar train after 2 \n", + "43 OpenEvolveTrainer Trace scalar train before 2 \n", + "44 PrioritySearch Trace scalar train after 2 \n", + "45 PrioritySearch Trace scalar train before 2 \n", + "46 TextGradTrainer Trace scalar train after 2 \n", + "47 TextGradTrainer Trace scalar train before 2 \n", "\n", - " expected output score \n", - "0 3.0 0 -3.0 \n", - "1 3.0 0 -3.0 \n", - "2 3.0 0 -3.0 \n", - "3 3.0 0 -3.0 \n", - "4 3.0 0 -3.0 \n", - "5 3.0 0 -3.0 \n", - "6 3.0 0 -3.0 \n", - "7 3.0 0 -3.0 \n", - "8 3.0 0 -3.0 \n", - "9 3.0 0 -3.0 \n", - "10 3.0 0 -3.0 \n", - "11 3.0 0 -3.0 \n", - "12 3.0 0.0 -3.0 \n", - "13 3.0 0.0 -3.0 \n", - "14 3.0 3.0 -0.0 \n", - "15 3.0 0.0 -3.0 \n", - "16 3.0 3.0 -0.0 \n", - "17 3.0 0.0 -3.0 \n", - "18 3.0 0.0 -3.0 \n", - "19 3.0 0.0 -3.0 \n", - "20 3.0 3.0 -0.0 \n", - "21 3.0 0.0 -3.0 \n", - "22 3.0 3.0 -0.0 \n", - "23 3.0 0.0 -3.0 \n", - "24 3.0 0.0 -3.0 \n", - "25 
3.0 0.0 -3.0 \n", - "26 3.0 3.0 -0.0 \n", - "27 3.0 0.0 -3.0 \n", - "28 3.0 3.0 -0.0 \n", - "29 3.0 0.0 -3.0 \n", - "30 3.0 0.0 -3.0 \n", - "31 3.0 0.0 -3.0 \n", - "32 3.0 3.0 -0.0 \n", - "33 3.0 0.0 -3.0 \n", - "34 3.0 3.0 -0.0 \n", - "35 3.0 0.0 -3.0 \n", - "36 3.0 0.0 -3.0 \n", - "37 3.0 0.0 -3.0 \n", - "38 3.0 3.0 -0.0 \n", - "39 3.0 0.0 -3.0 \n", - "40 3.0 3.0 -0.0 \n", - "41 3.0 0.0 -3.0 \n", - "42 3.0 0.0 -3.0 \n", - "43 3.0 0.0 -3.0 \n", - "44 3.0 3.0 -0.0 \n", - "45 3.0 0.0 -3.0 \n", - "46 3.0 3.0 -0.0 \n", - "47 3.0 0.0 -3.0 " + " input expected output score \n", + "0 routing code for scarlet ticket A A 1.0 \n", + "1 routing code for scarlet ticket A S 0.0 \n", + "2 routing code for azure ticket B B 1.0 \n", + "3 routing code for azure ticket B A 0.0 \n", + "4 routing code for emerald ticket C C 1.0 \n", + "5 routing code for emerald ticket C E 0.0 \n", + "6 customer tier scarlet A A 1.0 \n", + "7 customer tier scarlet A S 0.0 \n", + "8 customer tier azure B B 1.0 \n", + "9 customer tier azure B A 0.0 \n", + "10 customer tier emerald C C 1.0 \n", + "11 customer tier emerald C E 0.0 \n", + "12 test-a 3.0 1.0 -2.0 \n", + "13 test-a 3.0 0.0 -3.0 \n", + "14 test-a 3.0 3.0 -0.0 \n", + "15 test-a 3.0 0.0 -3.0 \n", + "16 test-a 3.0 3.0 -0.0 \n", + "17 test-a 3.0 0.0 -3.0 \n", + "18 test-b 3.0 1.0 -2.0 \n", + "19 test-b 3.0 0.0 -3.0 \n", + "20 test-b 3.0 3.0 -0.0 \n", + "21 test-b 3.0 0.0 -3.0 \n", + "22 test-b 3.0 3.0 -0.0 \n", + "23 test-b 3.0 0.0 -3.0 \n", + "24 test-c 3.0 1.0 -2.0 \n", + "25 test-c 3.0 0.0 -3.0 \n", + "26 test-c 3.0 3.0 -0.0 \n", + "27 test-c 3.0 0.0 -3.0 \n", + "28 test-c 3.0 3.0 -0.0 \n", + "29 test-c 3.0 0.0 -3.0 \n", + "30 train-a 3.0 1.0 -2.0 \n", + "31 train-a 3.0 0.0 -3.0 \n", + "32 train-a 3.0 3.0 -0.0 \n", + "33 train-a 3.0 0.0 -3.0 \n", + "34 train-a 3.0 3.0 -0.0 \n", + "35 train-a 3.0 0.0 -3.0 \n", + "36 train-b 3.0 1.0 -2.0 \n", + "37 train-b 3.0 0.0 -3.0 \n", + "38 train-b 3.0 3.0 -0.0 \n", + "39 train-b 3.0 0.0 -3.0 \n", + "40 
train-b 3.0 3.0 -0.0 \n", + "41 train-b 3.0 0.0 -3.0 \n", + "42 train-c 3.0 1.0 -2.0 \n", + "43 train-c 3.0 0.0 -3.0 \n", + "44 train-c 3.0 3.0 -0.0 \n", + "45 train-c 3.0 0.0 -3.0 \n", + "46 train-c 3.0 3.0 -0.0 \n", + "47 train-c 3.0 0.0 -3.0 " ] }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PrioritySearch: -3.0 -> 0.0 (test_delta=3.0, improvement=YES)\n", + "TextGradTrainer: -3.0 -> 0.0 (test_delta=3.0, improvement=YES)\n", + "OpenEvolveTrainer: -3.0 -> -2.0 (test_delta=1.0, improvement=YES)\n", + "DSPyTrainer: 0.0 -> 1.0 (test_delta=1.0, improvement=YES)\n" + ] } ], "source": [ @@ -2886,22 +1915,26 @@ " \"resolved_optimizer\": None,\n", " \"before_value\": None,\n", " \"after_value\": None,\n", - " \"train_examples\": len(SMOKE_TRAIN_DATASET[\"inputs\"]),\n", - " \"test_examples\": len(SMOKE_TEST_DATASET[\"inputs\"]),\n", + " \"train_examples\": None,\n", + " \"test_examples\": None,\n", " \"before_train_score\": None,\n", " \"after_train_score\": None,\n", " \"train_delta\": None,\n", " \"before_test_score\": None,\n", " \"after_test_score\": None,\n", " \"test_delta\": None,\n", + " \"improvement\": \"NO\",\n", " \"error\": item[\"error\"],\n", " })\n", " continue\n", + "\n", " result_status = item[\"result\"].get(\"status\")\n", " before_train = item[\"before\"][\"train\"][\"mean_score\"]\n", " after_train = item[\"after\"][\"train\"][\"mean_score\"]\n", " before_test = item[\"before\"][\"test\"][\"mean_score\"]\n", " after_test = item[\"after\"][\"test\"][\"mean_score\"]\n", + " test_delta = after_test - before_test\n", + " improved = result_status == \"ok\" and test_delta > 0\n", " summary_rows.append({\n", " \"trainer_id\": item[\"trainer_id\"],\n", " \"task\": item[\"task\"],\n", @@ -2910,14 +1943,15 @@ " \"resolved_optimizer\": item[\"result\"].get(\"resolved_optimizer\"),\n", " \"before_value\": item[\"before\"][\"value\"],\n", " \"after_value\": item[\"after\"][\"value\"],\n", - 
" \"train_examples\": len(SMOKE_TRAIN_DATASET[\"inputs\"]),\n", - " \"test_examples\": len(SMOKE_TEST_DATASET[\"inputs\"]),\n", + " \"train_examples\": item[\"train_examples\"],\n", + " \"test_examples\": item[\"test_examples\"],\n", " \"before_train_score\": before_train,\n", " \"after_train_score\": after_train,\n", " \"train_delta\": after_train - before_train,\n", " \"before_test_score\": before_test,\n", " \"after_test_score\": after_test,\n", - " \"test_delta\": after_test - before_test,\n", + " \"test_delta\": test_delta,\n", + " \"improvement\": \"YES\" if improved else \"NO\",\n", " \"error\": item[\"result\"].get(\"error\"),\n", " })\n", " for split_name in (\"train\", \"test\"):\n", @@ -2938,11 +1972,35 @@ "trainer_comparison = pd.DataFrame(summary_rows)\n", "example_comparison = pd.DataFrame(example_rows)\n", "\n", - "display(trainer_comparison)\n", + "score_columns = [\"before_train_score\", \"after_train_score\", \"train_delta\", \"before_test_score\", \"after_test_score\", \"test_delta\"]\n", + "def mark_no_improvement(row: pd.Series) -> list[str]:\n", + " style = \"background-color: #ffd6d6; color: #9f0000; font-weight: 700\"\n", + " return [style if row.get(\"improvement\") != \"YES\" else \"\" for _ in row]\n", + "\n", + "styled_comparison = (\n", + " trainer_comparison.style\n", + " .apply(mark_no_improvement, axis=1)\n", + " .format({column: \"{:.3f}\" for column in score_columns})\n", + ")\n", + "display(styled_comparison)\n", + "\n", + "no_improvement = trainer_comparison[trainer_comparison[\"improvement\"] != \"YES\"]\n", + "if no_improvement.empty:\n", + " display(HTML(\"
All trainers improved on held-out examples.
\"))\n", + "else:\n", + " names = \", \".join(no_improvement[\"trainer_id\"].astype(str).tolist())\n", + " display(HTML(f\"
NO HELD-OUT IMPROVEMENT: {names}
\"))\n", + "\n", "if example_rows:\n", " display(example_comparison.sort_values([\"task\", \"split\", \"example\", \"trainer_id\", \"phase\"]).reset_index(drop=True))\n", "else:\n", - " print(\"No per-example outputs were produced because all real trainer runs errored.\")" + " print(\"No per-example outputs were produced because all real trainer runs errored.\")\n", + "\n", + "for _, row in trainer_comparison.iterrows():\n", + " print(\n", + " f\"{row['trainer_id']}: {row['before_test_score']} -> {row['after_test_score']} \"\n", + " f\"(test_delta={row['test_delta']}, improvement={row['improvement']})\"\n", + " )" ] }, { @@ -2952,19 +2010,7 @@ "source": [ "## 9. Practical reading guide\n", "\n", - "When you inspect the results, read them in this order:\n", - "\n", - "1. **Focused tests** \n", - " If these fail, the branch is not ready to trust.\n", - "\n", - "2. **Discovery table** \n", - " If `TextGradTrainer`, `OpenEvolveTrainer`, or `DSPyTrainer` are missing, the branch or optional packages are not properly present or installed.\n", - "\n", - "3. **Real train/test smoke tables** \n", - " This confirms each trainer uses the real installed package path on three train examples and three held-out examples.\n", - "\n", - "4. **Error rows** \n", - " An error row means the real trainer path failed and should be inspected before trusting comparison scores." + "Read red rows first. A red row means the real trainer path either failed or did not improve the held-out score. Then inspect the per-example table to see whether the trainer changed the parameter/instruction and whether the change generalized beyond the three training examples." 
] }, { @@ -2978,18 +2024,13 @@ "- focused tests pass\n", "- discovery shows the comparison trainers\n", "- real `opto.optimizers.textgrad.TextGrad`, `openevolve.run_evolution`, and `dspy.LM` import successfully\n", - "- real smoke rows for Trace, TextGrad, OpenEvolve, and DSPy complete without errors\n", + "- all real trainer rows complete\n", + "- no rows are highlighted red\n", "\n", - "### Partial success\n", - "- focused tests pass\n", - "- structural checks pass\n", - "- one trainer reports an error row while the others complete, making the failure comparable\n", - "\n", - "### Failure\n", - "- trainers are not discovered\n", - "- focused tests fail\n", - "- DSPy is not backed by a real `dspy.LM`\n", - "- OpenEvolve path requires `exec` or unsafe parsing" + "### Needs follow-up\n", + "- a trainer reports an error row\n", + "- a trainer completes but is highlighted red because held-out score did not improve\n", + "- per-example outputs show memorization or no meaningful parameter/instruction change" ] } ], diff --git a/tests/test_textgrad_trainer.py b/tests/test_textgrad_trainer.py index 221f84c..f1ccc41 100644 --- a/tests/test_textgrad_trainer.py +++ b/tests/test_textgrad_trainer.py @@ -31,12 +31,14 @@ def __call__(self, task_input: str, response: str, task_info: str): return (1.0 if response == task_info else 0.0), f"expected {task_info}" -def _import_textgrad_trainer(monkeypatch, proposal: str): +def _import_textgrad_trainer(monkeypatch: pytest.MonkeyPatch, proposal: str, capture: dict[str, object] | None = None) -> types.ModuleType: fake_module = types.ModuleType("opto.optimizers.textgrad") class _FakeTextGrad: def __init__(self, parameters, **_kwargs) -> None: self.parameters = list(parameters) + if capture is not None: + capture["init_kwargs"] = _kwargs def zero_feedback(self) -> None: return None @@ -99,3 +101,21 @@ def parameters(self): train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, mode="real", ) + + +def 
test_textgrad_trainer_forwards_llm(monkeypatch: pytest.MonkeyPatch) -> None: + """TextGradTrainer forwards explicit LLM objects to NewTrace TextGrad.""" + capture: dict[str, object] = {} + trainer_module = _import_textgrad_trainer(monkeypatch, proposal="Hello", capture=capture) + trainer = trainer_module.TextGradTrainer(_DummyAgent("Hi")) + llm = object() + trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + ensure_improvement=False, + llm=llm, + ) + init_kwargs = capture["init_kwargs"] + assert isinstance(init_kwargs, dict) + assert init_kwargs["llm"] is llm diff --git a/trace_bench/trainers/textgrad_trainer.py b/trace_bench/trainers/textgrad_trainer.py index a8eb5e0..4bad679 100644 --- a/trace_bench/trainers/textgrad_trainer.py +++ b/trace_bench/trainers/textgrad_trainer.py @@ -51,7 +51,7 @@ def _standard_optimization_step(self, guide: Any, task_input: Any, task_info: An target = exc.exception_node return target, float(min_score), target.create_feedback("full") - def train(self, guide: Any, train_dataset: Dict[str, Any], *, mode: str = "real", num_epochs: int = 1, batch_size: int = 1, min_score: float = 0.0, validate_dataset: Optional[Dict[str, Any]] = None, ensure_improvement: bool = True, improvement_threshold: float = 0.0, max_tokens: int = 4096, verbose: Union[bool, str] = False, **_kwargs: Any) -> Dict[str, Any]: + def train(self, guide: Any, train_dataset: Dict[str, Any], *, mode: str = "real", num_epochs: int = 1, batch_size: int = 1, min_score: float = 0.0, validate_dataset: Optional[Dict[str, Any]] = None, ensure_improvement: bool = True, improvement_threshold: float = 0.0, max_tokens: int = 4096, llm: Any = None, verbose: Union[bool, str] = False, **_kwargs: Any) -> Dict[str, Any]: """Optimize Trace parameters with the TextGrad optimizer provided by NewTrace.""" if mode not in {"real", "stub"}: raise ValueError("mode must be either 'real' or 'stub'.") @@ -70,7 +70,7 @@ def 
train(self, guide: Any, train_dataset: Dict[str, Any], *, mode: str = "real" if not inputs: raise ValueError("train_dataset must contain at least one example.") - optimizer = _TraceTextGrad(parameters=parameters, max_tokens=max_tokens) + optimizer = _TraceTextGrad(parameters=parameters, max_tokens=max_tokens, llm=llm) for _ in range(num_epochs): for start in range(0, len(inputs), batch_size): batch_inputs = inputs[start : start + batch_size] From 5406dd658f5500550a95e79661118336f3eda137 Mon Sep 17 00:00:00 2001 From: doxav <> Date: Wed, 3 Jun 2026 00:34:49 +0200 Subject: [PATCH 4/8] prevent modifying dspy_trainer.py --- trace_bench/trainers/dspy_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trace_bench/trainers/dspy_trainer.py b/trace_bench/trainers/dspy_trainer.py index 9c1f702..665cb02 100644 --- a/trace_bench/trainers/dspy_trainer.py +++ b/trace_bench/trainers/dspy_trainer.py @@ -505,7 +505,7 @@ def train( verbose=verbose, ) finally: - if resolved_lm is not None: + if resolved_lm is not None and prev_lm is not None: _dspy.configure(lm=prev_lm) def _train_inner( From 638bc58f785139ac2a1b51de403bcd8110ea6539 Mon Sep 17 00:00:00 2001 From: doxav <> Date: Thu, 11 Jun 2026 08:33:31 +0200 Subject: [PATCH 5/8] adding missing file :-) : trace_bench/llm.py --- trace_bench/llm.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 trace_bench/llm.py diff --git a/trace_bench/llm.py b/trace_bench/llm.py new file mode 100644 index 0000000..b3f6034 --- /dev/null +++ b/trace_bench/llm.py @@ -0,0 +1,10 @@ +from __future__ import annotations + + +def openai_compatible_model_name(model: str) -> str: + """Return the model identifier expected by OpenAI-compatible clients.""" + if not isinstance(model, str): + raise TypeError("model must be a string.") + if model.startswith("openrouter/"): + return model.split("/", 1)[1] + return model From 4b27bd603c4d35c07f0ba4b5700eb7c976fcd7e9 Mon Sep 17 00:00:00 2001 From: doxav <> Date: Thu, 11 
Jun 2026 10:53:02 +0200 Subject: [PATCH 6/8] chore: retrigger CI against latest main From 705a83e0c41d4a74baf04c4f0a280a2bf3521bb3 Mon Sep 17 00:00:00 2001 From: doxav <> Date: Thu, 11 Jun 2026 11:53:37 +0200 Subject: [PATCH 7/8] ci: install HF extra and normalize batch_size alias --- .github/workflows/ci.yml | 2 +- trace_bench/cli.py | 1 + trace_bench/resolve.py | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb6758b..0d65e95 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,7 +40,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install "git+https://github.com/AgentOpt/OpenTrace.git@experimental" - python -m pip install -e . + python -m pip install -e ".[hf]" - name: Validate installation and syntax run: | diff --git a/trace_bench/cli.py b/trace_bench/cli.py index 745aca2..8813326 100644 --- a/trace_bench/cli.py +++ b/trace_bench/cli.py @@ -68,6 +68,7 @@ def _task_in_bench(task_key: str, bench: str | None) -> bool: "num_iters", "num_search_iterations", "train_batch_size", + "batch_size", "merge_every", "pareto_subset_size", "ps_steps", diff --git a/trace_bench/resolve.py b/trace_bench/resolve.py index c173ed5..475b7e0 100644 --- a/trace_bench/resolve.py +++ b/trace_bench/resolve.py @@ -41,6 +41,7 @@ def _param_alias_map(algo_name: str) -> Dict[str, str]: "ps_candidates": "num_candidates", "ps_proposals": "num_proposals", "ps_mem_update": "memory_update_frequency", + "batch_size": "train_batch_size", } ) if algo_name in _GEPA_TRAINERS: From 93ec869c9b66985c6659f26ad0d45a48ea965f05 Mon Sep 17 00:00:00 2001 From: doxav <> Date: Thu, 11 Jun 2026 14:02:19 +0200 Subject: [PATCH 8/8] fix: declare nest-asyncio dependency for OpenEvolve trainer --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index a808cda..769805e 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,7 @@ "tensorboardX", "tensorboard", "pyyaml", + 
"nest-asyncio>=1.6.0", ] # Optional dependencies for external trainers in trace_bench/trainers/.