diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb6758b..0d65e95 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,7 +40,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install "git+https://github.com/AgentOpt/OpenTrace.git@experimental" - python -m pip install -e . + python -m pip install -e ".[hf]" - name: Validate installation and syntax run: | diff --git a/README.md b/README.md index 40e3aae..3cf59bc 100644 --- a/README.md +++ b/README.md @@ -128,3 +128,9 @@ PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 pytest -q ## License MIT + +## External Trainers + +- `DSPyTrainer` (`trace_bench/trainers/dspy_trainer.py`) +- `TextGradTrainer` (`trace_bench/trainers/textgrad_trainer.py`) +- `OpenEvolveTrainer` (`trace_bench/trainers/openevolve_trainer.py`) diff --git a/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb b/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb new file mode 100644 index 0000000..fb09a9c --- /dev/null +++ b/notebooks/09_textgrad_openevolve_evaluation_notebook.ipynb @@ -0,0 +1,2058 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6fa48e6e", + "metadata": {}, + "source": [ + "# Trainer comparison notebook\n", + "\n", + "This notebook validates and compares four real trainer paths:\n", + "\n", + "- `PrioritySearch` as the standard Trace baseline\n", + "- `TextGradTrainer`\n", + "- `OpenEvolveTrainer`\n", + "- `DSPyTrainer`\n", + "\n", + "It checks out `textgrad_openevolve`, installs real optional packages when needed, runs focused integration checks, and runs a small real optimization demo with OpenRouter or OpenAI. The DSPy row uses a DSPy-native task; the Trace/TextGrad/OpenEvolve rows use a Trace scalar task." 
+ ] + }, + { + "cell_type": "markdown", + "id": "0f51598c", + "metadata": {}, + "source": [ + "## What this notebook verifies\n", + "\n", + "- required trainer packages import from real installations\n", + "- Trace-Bench discovers the trainer classes\n", + "- focused trainer tests and compile checks pass\n", + "- every comparison row learns from three examples and reports three held-out examples\n", + "- result tables show before/after scores, per-example outputs, and red highlighting for rows with no held-out improvement" + ] + }, + { + "cell_type": "markdown", + "id": "b4ae0512", + "metadata": {}, + "source": [ + "## High-level interpretation guide\n", + "\n", + "Use the notebook in this order:\n", + "\n", + "1. Confirm the branch, package imports, trainer discovery, and focused tests.\n", + "2. Compare before/after train and held-out scores.\n", + "3. Treat any red row as a real trainer run that completed or failed without held-out improvement." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "56d885a1", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T16:40:11.307336Z", + "iopub.status.busy": "2026-06-02T16:40:11.307259Z", + "iopub.status.idle": "2026-06-02T16:40:11.311966Z", + "shell.execute_reply": "2026-06-02T16:40:11.311533Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WORKDIR = /home/xav/code/Trace-Bench\n", + "TRACE_BENCH_REMOTE_URL = https://github.com/doxav/Trace-Bench.git\n", + "TRACE_BENCH_BRANCH = textgrad_openevolve\n", + "TRACE_BENCH_REPO = /home/xav/code/Trace-Bench\n", + "NEWTRACE_REMOTE_URL = https://github.com/doxav/NewTrace.git\n", + "NEWTRACE_BRANCH = experimental\n", + "NEWTRACE_REPO = /home/xav/code/Trace-Bench/NewTrace\n", + "OPENEVOLVE_REMOTE_URL = https://github.com/algorithmicsuperintelligence/openevolve.git\n", + "OPENEVOLVE_BRANCH = main\n", + "OPENEVOLVE_REPO = /home/xav/code/Trace-Bench/openevolve\n" + ] + } + ], + "source": [ + "import os\n", + 
"import sys\n", + "import subprocess\n", + "from collections.abc import Sequence\n", + "from pathlib import Path\n", + "from subprocess import CompletedProcess\n", + "\n", + "WORKDIR = Path(\"/content\") if Path(\"/content\").exists() else Path.cwd()\n", + "CURRENT_REPO = Path.cwd()\n", + "TRACE_BENCH_REMOTE_URL = \"https://github.com/doxav/Trace-Bench.git\"\n", + "TRACE_BENCH_BRANCH = \"textgrad_openevolve\"\n", + "TRACE_BENCH_REPO = CURRENT_REPO if (CURRENT_REPO / \"trace_bench\").is_dir() else WORKDIR / \"Trace-Bench\"\n", + "NEWTRACE_REMOTE_URL = \"https://github.com/doxav/NewTrace.git\"\n", + "NEWTRACE_BRANCH = \"experimental\"\n", + "NEWTRACE_REPO = WORKDIR / \"NewTrace\"\n", + "OPENEVOLVE_REMOTE_URL = \"https://github.com/algorithmicsuperintelligence/openevolve.git\"\n", + "OPENEVOLVE_BRANCH = \"main\"\n", + "OPENEVOLVE_REPO = WORKDIR / \"openevolve\"\n", + "\n", + "for repo_path in (NEWTRACE_REPO, TRACE_BENCH_REPO):\n", + " repo_path_str = str(repo_path)\n", + " if repo_path_str not in sys.path:\n", + " sys.path.insert(0, repo_path_str)\n", + "\n", + "def run(cmd: Sequence[str | os.PathLike[str]], cwd: Path | str | None = None, check: bool = True) -> CompletedProcess[bytes]:\n", + " \"\"\"Run a subprocess command and echo its argv without shell interpolation.\"\"\"\n", + " print(\"$\", \" \".join(map(str, cmd)))\n", + " return subprocess.run([str(part) for part in cmd], cwd=cwd, check=check)\n", + "\n", + "def checkout_branch(repo_path: Path, remote_url: str, branch: str) -> None:\n", + " \"\"\"Fetch, checkout, and fast-forward a branch in an existing clone.\"\"\"\n", + " run([\"git\", \"fetch\", remote_url, branch], cwd=repo_path)\n", + " checkout = run([\"git\", \"checkout\", branch], cwd=repo_path, check=False)\n", + " if checkout.returncode != 0:\n", + " run([\"git\", \"checkout\", \"-b\", branch, \"FETCH_HEAD\"], cwd=repo_path)\n", + " run([\"git\", \"pull\", \"--ff-only\", remote_url, branch], cwd=repo_path)\n", + "\n", + "print(\"WORKDIR =\", 
WORKDIR)\n", + "print(\"TRACE_BENCH_REMOTE_URL =\", TRACE_BENCH_REMOTE_URL)\n", + "print(\"TRACE_BENCH_BRANCH =\", TRACE_BENCH_BRANCH)\n", + "print(\"TRACE_BENCH_REPO =\", TRACE_BENCH_REPO)\n", + "print(\"NEWTRACE_REMOTE_URL =\", NEWTRACE_REMOTE_URL)\n", + "print(\"NEWTRACE_BRANCH =\", NEWTRACE_BRANCH)\n", + "print(\"NEWTRACE_REPO =\", NEWTRACE_REPO)\n", + "print(\"OPENEVOLVE_REMOTE_URL =\", OPENEVOLVE_REMOTE_URL)\n", + "print(\"OPENEVOLVE_BRANCH =\", OPENEVOLVE_BRANCH)\n", + "print(\"OPENEVOLVE_REPO =\", OPENEVOLVE_REPO)" + ] + }, + { + "cell_type": "markdown", + "id": "6d7c51fb", + "metadata": {}, + "source": [ + "## 1. Clone and checkout the repositories\n", + "\n", + "This clones:\n", + "- `Trace-Bench` on `textgrad_openevolve`\n", + "- `doxav/NewTrace` on `experimental`\n", + "- `OpenEvolve` only if the real package is missing\n", + "\n", + "Skip this if you already have local checkouts and want to point the notebook at them manually." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b6ca8593", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T16:40:11.313317Z", + "iopub.status.busy": "2026-06-02T16:40:11.313253Z", + "iopub.status.idle": "2026-06-02T16:40:13.710562Z", + "shell.execute_reply": "2026-06-02T16:40:13.710030Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Trace-Bench current checkout is already on textgrad_openevolve; preserving local edits.\n", + "$ git clone --branch experimental --single-branch https://github.com/doxav/NewTrace.git /home/xav/code/Trace-Bench/NewTrace\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Cloning into '/home/xav/code/Trace-Bench/NewTrace'...\n" + ] + } + ], + "source": [ + "if not TRACE_BENCH_REPO.exists():\n", + " run([\n", + " \"git\", \"clone\",\n", + " \"--branch\", TRACE_BENCH_BRANCH,\n", + " \"--single-branch\",\n", + " TRACE_BENCH_REMOTE_URL,\n", + " str(TRACE_BENCH_REPO),\n", + " ])\n", + "elif 
TRACE_BENCH_REPO.resolve() == CURRENT_REPO.resolve():\n", + " branch = subprocess.check_output([\"git\", \"branch\", \"--show-current\"], cwd=TRACE_BENCH_REPO, text=True).strip()\n", + " if branch != TRACE_BENCH_BRANCH:\n", + " checkout_branch(TRACE_BENCH_REPO, TRACE_BENCH_REMOTE_URL, TRACE_BENCH_BRANCH)\n", + " else:\n", + " print(f\"Trace-Bench current checkout is already on {TRACE_BENCH_BRANCH}; preserving local edits.\")\n", + "else:\n", + " print(f\"Trace-Bench already exists; checking out {TRACE_BENCH_BRANCH}.\")\n", + " checkout_branch(TRACE_BENCH_REPO, TRACE_BENCH_REMOTE_URL, TRACE_BENCH_BRANCH)\n", + "\n", + "if not NEWTRACE_REPO.exists():\n", + " run([\n", + " \"git\", \"clone\",\n", + " \"--branch\", NEWTRACE_BRANCH,\n", + " \"--single-branch\",\n", + " NEWTRACE_REMOTE_URL,\n", + " str(NEWTRACE_REPO),\n", + " ])\n", + "else:\n", + " print(f\"NewTrace already exists; checking out {NEWTRACE_BRANCH}.\")\n", + " checkout_branch(NEWTRACE_REPO, NEWTRACE_REMOTE_URL, NEWTRACE_BRANCH)" + ] + }, + { + "cell_type": "markdown", + "id": "963c01d5", + "metadata": {}, + "source": [ + "## 2. Install Python dependencies\n", + "\n", + "This installs NewTrace and Trace-Bench editable, plus the real optional trainer packages used by the demo. If `openevolve.run_evolution` is not importable, OpenEvolve is cloned and installed editable." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "fbae758b", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T16:40:13.711999Z", + "iopub.status.busy": "2026-06-02T16:40:13.711923Z", + "iopub.status.idle": "2026-06-02T16:40:20.589979Z", + "shell.execute_reply": "2026-06-02T16:40:20.589514Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "$ /home/xav/miniconda3/bin/python -m pip install -q -U pip setuptools wheel\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "$ /home/xav/miniconda3/bin/python -m pip install -q graphviz pyyaml pytest litellm aiohttp nest_asyncio dspy-ai optuna tensorboard tensorboardX scikit-learn datasets openai pandas\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "$ /home/xav/miniconda3/bin/python -m pip install -q -e /home/xav/code/Trace-Bench/NewTrace\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "$ /home/xav/miniconda3/bin/python -m pip install -q -e /home/xav/code/Trace-Bench\n" + ] + } + ], + "source": [ + "run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-U\", \"pip\", \"setuptools\", \"wheel\"])\n", + "run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\",\n", + " \"graphviz\", \"pyyaml\", \"pytest\", \"litellm\", \"aiohttp\", \"nest_asyncio\", \"dspy-ai\", \"optuna\",\n", + " \"tensorboard\", \"tensorboardX\", \"scikit-learn\", \"datasets\", \"openai\", \"pandas\"])\n", + "\n", + "run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-e\", str(NEWTRACE_REPO)])\n", + "run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-e\", str(TRACE_BENCH_REPO)])\n", + "\n", + "def has_real_openevolve() -> bool:\n", + " \"\"\"Return True only when the real OpenEvolve API is importable.\"\"\"\n", + " try:\n", + " import openevolve\n", + " return callable(getattr(openevolve, \"run_evolution\", None))\n", + " except Exception:\n", + " return False\n", + 
"\n", + "if not has_real_openevolve():\n", + " if not OPENEVOLVE_REPO.exists():\n", + " run([\n", + " \"git\", \"clone\",\n", + " \"--branch\", OPENEVOLVE_BRANCH,\n", + " \"--single-branch\",\n", + " OPENEVOLVE_REMOTE_URL,\n", + " str(OPENEVOLVE_REPO),\n", + " ])\n", + " else:\n", + " print(f\"OpenEvolve already exists; checking out {OPENEVOLVE_BRANCH}.\")\n", + " checkout_branch(OPENEVOLVE_REPO, OPENEVOLVE_REMOTE_URL, OPENEVOLVE_BRANCH)\n", + " run([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-e\", str(OPENEVOLVE_REPO)])\n", + "\n", + "if not has_real_openevolve():\n", + " raise ImportError(\"OpenEvolve is required for this demo and could not be installed.\")" + ] + }, + { + "cell_type": "markdown", + "id": "25fd9e44", + "metadata": {}, + "source": [ + "## 3. Provider setup for real online experiments\n", + "\n", + "The comparison requires a real provider. In Colab the cell reads `OPENROUTER_API_KEY` or `OPENAI_API_KEY` from Colab Secrets when present; locally it reads the same environment variables." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0b984ab0", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T16:40:20.591586Z", + "iopub.status.busy": "2026-06-02T16:40:20.591465Z", + "iopub.status.idle": "2026-06-02T16:40:20.595509Z", + "shell.execute_reply": "2026-06-02T16:40:20.595070Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PROVIDER = openrouter\n", + "TRACE_LITELLM_MODEL = openrouter/openai/gpt-4o-mini\n", + "OPENAI_BASE_URL = https://openrouter.ai/api/v1\n", + "OPENROUTER_API_KEY configured = True\n" + ] + } + ], + "source": [ + "from getpass import getpass\n", + "\n", + "def colab_secret(name: str) -> str:\n", + " \"\"\"Return a Colab Secret value when available, otherwise an empty string.\"\"\"\n", + " try:\n", + " from google.colab import userdata\n", + " except Exception:\n", + " return \"\"\n", + " try:\n", + " return userdata.get(name) or \"\"\n", + " except Exception:\n", + " return \"\"\n", + "\n", + "PROVIDER = \"auto\" # @param [\"auto\", \"openrouter\", \"openai\", \"none\"]\n", + "MODEL = \"\" # @param {type:\"string\"}\n", + "\n", + "openrouter_key = os.environ.get(\"OPENROUTER_API_KEY\") or colab_secret(\"OPENROUTER_API_KEY\")\n", + "openai_key = os.environ.get(\"OPENAI_API_KEY\") or colab_secret(\"OPENAI_API_KEY\")\n", + "MODEL = MODEL or os.environ.get(\"TRACE_LITELLM_MODEL\") or colab_secret(\"TRACE_LITELLM_MODEL\")\n", + "\n", + "if PROVIDER == \"auto\":\n", + " active_provider = \"openrouter\" if openrouter_key else \"openai\" if openai_key else \"none\"\n", + "else:\n", + " active_provider = PROVIDER\n", + "\n", + "if active_provider == \"openrouter\":\n", + " if not MODEL:\n", + " MODEL = \"openrouter/openai/gpt-4o-mini\"\n", + " if not openrouter_key:\n", + " openrouter_key = getpass(\"OPENROUTER_API_KEY: \")\n", + " if not openrouter_key:\n", + " raise ValueError(\"OPENROUTER_API_KEY is required when PROVIDER is openrouter.\")\n", + " 
os.environ[\"OPENROUTER_API_KEY\"] = openrouter_key\n", + " os.environ[\"OPENAI_API_KEY\"] = openrouter_key\n", + " os.environ[\"OPENAI_BASE_URL\"] = \"https://openrouter.ai/api/v1\"\n", + " os.environ[\"OPENAI_API_BASE\"] = \"https://openrouter.ai/api/v1\"\n", + " os.environ[\"TRACE_LITELLM_MODEL\"] = MODEL\n", + "elif active_provider == \"openai\":\n", + " if not MODEL:\n", + " MODEL = \"gpt-4o-mini\"\n", + " if not openai_key:\n", + " openai_key = getpass(\"OPENAI_API_KEY: \")\n", + " if not openai_key:\n", + " raise ValueError(\"OPENAI_API_KEY is required when PROVIDER is openai.\")\n", + " os.environ[\"OPENAI_API_KEY\"] = openai_key\n", + " os.environ[\"TRACE_LITELLM_MODEL\"] = MODEL\n", + "elif active_provider == \"none\":\n", + " print(\"Skipping online provider configuration.\")\n", + "else:\n", + " raise ValueError(f\"Unsupported PROVIDER: {PROVIDER}\")\n", + "\n", + "print(\"PROVIDER =\", active_provider)\n", + "print(\"TRACE_LITELLM_MODEL =\", os.environ.get(\"TRACE_LITELLM_MODEL\"))\n", + "print(\"OPENAI_BASE_URL =\", os.environ.get(\"OPENAI_BASE_URL\"))\n", + "print(\"OPENROUTER_API_KEY configured =\", bool(os.environ.get(\"OPENROUTER_API_KEY\")))" + ] + }, + { + "cell_type": "markdown", + "id": "1a574c62", + "metadata": {}, + "source": [ + "## 4. 
Sanity checks and imports" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3b4768bb", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T16:40:20.596914Z", + "iopub.status.busy": "2026-06-02T16:40:20.596844Z", + "iopub.status.idle": "2026-06-02T16:40:25.850826Z", + "shell.execute_reply": "2026-06-02T16:40:25.850296Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OK: opto.optimizers.textgrad\n", + "OK: openevolve\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OK: dspy\n", + "OK: trace_bench\n", + "OK: trace_bench.runner\n", + "OK: trace_bench.registry\n", + "OK: trace_bench.config\n", + "OK: trace_bench.trainers.textgrad_trainer\n", + "OK: trace_bench.trainers.openevolve_trainer\n", + "OK: trace_bench.trainers.dspy_trainer\n", + "TextGrad module: /home/xav/code/Trace-Bench/NewTrace/opto/optimizers/textgrad.py\n", + "OpenEvolve module: /home/xav/miniconda3/lib/python3.13/site-packages/openevolve/__init__.py\n", + "DSPy module: /home/xav/miniconda3/lib/python3.13/site-packages/dspy/__init__.py\n" + ] + } + ], + "source": [ + "import importlib\n", + "import pandas as pd\n", + "\n", + "def required_import(name: str) -> object:\n", + " \"\"\"Import a required module and raise a descriptive error when unavailable.\"\"\"\n", + " try:\n", + " module = importlib.import_module(name)\n", + " print(\"OK:\", name)\n", + " return module\n", + " except Exception as exc:\n", + " raise ImportError(f\"Required module is unavailable: {name}\") from exc\n", + "\n", + "textgrad_module = required_import(\"opto.optimizers.textgrad\")\n", + "openevolve_module = required_import(\"openevolve\")\n", + "dspy_module = required_import(\"dspy\")\n", + "required_import(\"trace_bench\")\n", + "required_import(\"trace_bench.runner\")\n", + "required_import(\"trace_bench.registry\")\n", + "required_import(\"trace_bench.config\")\n", + 
"required_import(\"trace_bench.trainers.textgrad_trainer\")\n", + "required_import(\"trace_bench.trainers.openevolve_trainer\")\n", + "required_import(\"trace_bench.trainers.dspy_trainer\")\n", + "\n", + "if not callable(getattr(textgrad_module, \"TextGrad\", None)):\n", + " raise ImportError(\"opto.optimizers.textgrad.TextGrad is required for this demo.\")\n", + "if not callable(getattr(openevolve_module, \"run_evolution\", None)):\n", + " raise ImportError(\"openevolve.run_evolution is required for this demo.\")\n", + "if not callable(getattr(dspy_module, \"LM\", None)):\n", + " raise ImportError(\"dspy.LM is required for this demo.\")\n", + "\n", + "print(\"TextGrad module:\", getattr(textgrad_module, \"__file__\", \"unknown\"))\n", + "print(\"OpenEvolve module:\", getattr(openevolve_module, \"__file__\", \"unknown\"))\n", + "print(\"DSPy module:\", getattr(dspy_module, \"__file__\", \"unknown\"))" + ] + }, + { + "cell_type": "markdown", + "id": "4e82660b", + "metadata": {}, + "source": [ + "## 5. Focused validation commands\n", + "\n", + "These are the most relevant tests for the new trainers and their integration surface." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "af508c08", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T16:40:25.852675Z", + "iopub.status.busy": "2026-06-02T16:40:25.852596Z", + "iopub.status.idle": "2026-06-02T16:40:29.170929Z", + "shell.execute_reply": "2026-06-02T16:40:29.170457Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "$ /home/xav/miniconda3/bin/python -m pytest tests/test_resolve_external_trainers.py tests/test_external_utils.py tests/test_llm_utils.py tests/test_textgrad_trainer.py tests/test_openevolve_trainer.py tests/test_dspy_trainer.py -q\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "................" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ". 
[100%]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "17 passed in 2.54s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "$ /home/xav/miniconda3/bin/python -m py_compile trace_bench/resolve.py trace_bench/cli.py trace_bench/runner.py trace_bench/llm.py trace_bench/trainers/_external_utils.py trace_bench/trainers/textgrad_trainer.py trace_bench/trainers/openevolve_trainer.py trace_bench/trainers/dspy_trainer.py\n" + ] + }, + { + "data": { + "text/plain": [ + "CompletedProcess(args=['/home/xav/miniconda3/bin/python', '-m', 'py_compile', 'trace_bench/resolve.py', 'trace_bench/cli.py', 'trace_bench/runner.py', 'trace_bench/llm.py', 'trace_bench/trainers/_external_utils.py', 'trace_bench/trainers/textgrad_trainer.py', 'trace_bench/trainers/openevolve_trainer.py', 'trace_bench/trainers/dspy_trainer.py'], returncode=0)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "TARGETED_TESTS = [\n", + " \"tests/test_resolve_external_trainers.py\",\n", + " \"tests/test_external_utils.py\",\n", + " \"tests/test_llm_utils.py\",\n", + " \"tests/test_textgrad_trainer.py\",\n", + " \"tests/test_openevolve_trainer.py\",\n", + " \"tests/test_dspy_trainer.py\",\n", + "]\n", + "\n", + "run([sys.executable, \"-m\", \"pytest\", *TARGETED_TESTS, \"-q\"], cwd=TRACE_BENCH_REPO)\n", + "run([sys.executable, \"-m\", \"py_compile\",\n", + " \"trace_bench/resolve.py\",\n", + " \"trace_bench/cli.py\",\n", + " \"trace_bench/runner.py\",\n", + " \"trace_bench/llm.py\",\n", + " \"trace_bench/trainers/_external_utils.py\",\n", + " \"trace_bench/trainers/textgrad_trainer.py\",\n", + " \"trace_bench/trainers/openevolve_trainer.py\",\n", + " \"trace_bench/trainers/dspy_trainer.py\"], cwd=TRACE_BENCH_REPO)" + ] + }, + { + "cell_type": "markdown", + "id": "3442bf98", + "metadata": {}, + "source": [ + "## 6. 
Trainer discovery and signatures\n", + "\n", + "This is the fastest way to see whether the branch contains the trainer code and wires it into discovery." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c182738c", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T16:40:29.172741Z", + "iopub.status.busy": "2026-06-02T16:40:29.172667Z", + "iopub.status.idle": "2026-06-02T16:40:29.246810Z", + "shell.execute_reply": "2026-06-02T16:40:29.246111Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trainer_idavailablesourceresolved_typeresolved_nameuses_trace_optimizerframework
0DSPyTrainerTruetrace_bench.trainers.dspy_trainerclassDSPyTrainerFalsedspy
1OpenEvolveTrainerTruetrace_bench.trainers.openevolve_trainerclassOpenEvolveTrainerFalseNaN
2PrioritySearchTrueopto.features.priority_search.priority_searchstrPrioritySearchNoneNaN
3TextGradTrainerTruetrace_bench.trainers.textgrad_trainerclassTextGradTrainerFalseNaN
\n", + "
" + ], + "text/plain": [ + " trainer_id available \\\n", + "0 DSPyTrainer True \n", + "1 OpenEvolveTrainer True \n", + "2 PrioritySearch True \n", + "3 TextGradTrainer True \n", + "\n", + " source resolved_type \\\n", + "0 trace_bench.trainers.dspy_trainer class \n", + "1 trace_bench.trainers.openevolve_trainer class \n", + "2 opto.features.priority_search.priority_search str \n", + "3 trace_bench.trainers.textgrad_trainer class \n", + "\n", + " resolved_name uses_trace_optimizer framework \n", + "0 DSPyTrainer False dspy \n", + "1 OpenEvolveTrainer False NaN \n", + "2 PrioritySearch None NaN \n", + "3 TextGradTrainer False NaN " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from trace_bench.registry import discover_trainers\n", + "from trace_bench.runner import _resolve_algorithm\n", + "\n", + "trainer_rows = []\n", + "for spec in discover_trainers():\n", + " if spec.id in {\"PrioritySearch\", \"TextGradTrainer\", \"OpenEvolveTrainer\", \"DSPyTrainer\"}:\n", + " resolved = _resolve_algorithm(spec.id)\n", + " trainer_rows.append({\n", + " \"trainer_id\": spec.id,\n", + " \"available\": spec.available,\n", + " \"source\": spec.source,\n", + " \"resolved_type\": type(resolved).__name__ if not isinstance(resolved, type) else \"class\",\n", + " \"resolved_name\": getattr(resolved, \"__name__\", str(resolved)),\n", + " \"uses_trace_optimizer\": getattr(resolved, \"USES_TRACE_OPTIMIZER\", None) if isinstance(resolved, type) else None,\n", + " \"framework\": getattr(resolved, \"FRAMEWORK\", None) if isinstance(resolved, type) else None,\n", + " })\n", + "\n", + "pd.DataFrame(trainer_rows).sort_values(\"trainer_id\").reset_index(drop=True)" + ] + }, + { + "cell_type": "markdown", + "id": "118fce8b", + "metadata": {}, + "source": [ + "## 7. Shared helpers for real train/test optimization\n", + "\n", + "The Trace, TextGrad, and OpenEvolve rows use the same Trace scalar parameter task. 
The DSPy row uses a small routing-code task where MIPROv2 can optimize instructions from labeled examples within notebook runtime." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8eddf6c9", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T16:40:29.248580Z", + "iopub.status.busy": "2026-06-02T16:40:29.248504Z", + "iopub.status.idle": "2026-06-02T16:40:29.259545Z", + "shell.execute_reply": "2026-06-02T16:40:29.259148Z" + } + }, + "outputs": [], + "source": [ + "import contextlib\n", + "import io\n", + "import re\n", + "from typing import Any, Callable\n", + "\n", + "import dspy\n", + "from IPython.display import HTML, display\n", + "\n", + "from trace_bench.config import TrainerConfig\n", + "from trace_bench.registry import load_task_bundle\n", + "from trace_bench.runner import _train_bundle\n", + "from trace_bench.trainers._external_utils import apply_parameter_updates\n", + "\n", + "TRACE_TASK_ID = \"trace_examples:opentrace_train_single_node\"\n", + "TASKS_ROOT = str(TRACE_BENCH_REPO / \"LLM4AD\" / \"benchmark_tasks\")\n", + "TRACE_INITIAL_VALUE = 0.0\n", + "TRACE_TARGET_VALUE = 3.0\n", + "TRACE_TRAIN_DATASET = {\"inputs\": [\"train-a\", \"train-b\", \"train-c\"], \"infos\": [TRACE_TARGET_VALUE] * 3}\n", + "TRACE_TEST_DATASET = {\"inputs\": [\"test-a\", \"test-b\", \"test-c\"], \"infos\": [TRACE_TARGET_VALUE] * 3}\n", + "DSPY_TRAIN_DATASET = {\n", + " \"inputs\": [\"customer tier scarlet\", \"customer tier azure\", \"customer tier emerald\"],\n", + " \"infos\": [\"A\", \"B\", \"C\"],\n", + "}\n", + "DSPY_TEST_DATASET = {\n", + " \"inputs\": [\"routing code for scarlet ticket\", \"routing code for azure ticket\", \"routing code for emerald ticket\"],\n", + " \"infos\": [\"A\", \"B\", \"C\"],\n", + "}\n", + "\n", + "class RoutingDSPySignature(dspy.Signature):\n", + " \"\"\"Return the requested routing code as a single uppercase letter.\"\"\"\n", + " ticket: str = dspy.InputField()\n", + " answer: str = 
dspy.OutputField(desc=\"single uppercase letter\")\n", + "\n", + "class RoutingDSPyAgent(dspy.Module):\n", + " \"\"\"Small DSPy module optimized through DSPyTrainer.\"\"\"\n", + " def __init__(self) -> None:\n", + " super().__init__()\n", + " self.predict = dspy.Predict(RoutingDSPySignature)\n", + "\n", + " def forward(self, ticket: str) -> str:\n", + " return self.predict(ticket=ticket).answer\n", + "\n", + " @classmethod\n", + " def to_examples(cls, inputs: list[Any], infos: list[Any]) -> list[Any]:\n", + " return [\n", + " dspy.Example(ticket=str(ticket), answer=str(code), _task=ticket, _info=code).with_inputs(\"ticket\")\n", + " for ticket, code in zip(inputs, infos)\n", + " ]\n", + "\n", + "class RoutingDSPyGuide:\n", + " \"\"\"Exact-match routing-code metric for DSPy optimizers.\"\"\"\n", + " def get_feedback(self, _query: Any, response: Any, reference: Any, **_kwargs: Any) -> tuple[float, str]:\n", + " text = str(getattr(response, \"data\", response)).strip().upper()\n", + " match = re.search(r\"\\b[A-Z]\\b\", text)\n", + " prediction = match.group(0) if match else text[:1]\n", + " target = str(reference).strip().upper()\n", + " score = 1.0 if prediction == target else 0.0\n", + " return score, f\"expected={target}; response={text}\"\n", + "\n", + " def __call__(self, query: Any, response: Any, reference: Any, **kwargs: Any) -> tuple[float, str]:\n", + " return self.get_feedback(query, response, reference, **kwargs)\n", + "\n", + "def _set_only_scalar_trainable(bundle: dict[str, Any]) -> None:\n", + " param = bundle[\"param\"]\n", + " scalar = getattr(param, \"value\", None)\n", + " if scalar is None:\n", + " scalar = getattr(param, \"guess\", None)\n", + " if scalar is None:\n", + " raise AttributeError(\"Scalar demo task requires param.value or param.guess.\")\n", + " for parameter in param.parameters():\n", + " parameter.trainable = parameter is scalar\n", + " apply_parameter_updates({scalar: TRACE_INITIAL_VALUE})\n", + "\n", + "def make_trace_demo_bundle()
-> dict[str, Any]:\n", + " bundle = load_task_bundle(TRACE_TASK_ID, TASKS_ROOT)\n", + " _set_only_scalar_trainable(bundle)\n", + " bundle[\"train_dataset\"] = TRACE_TRAIN_DATASET\n", + " bundle[\"test_dataset\"] = TRACE_TEST_DATASET\n", + " bundle.pop(\"validate_dataset\", None)\n", + " bundle[\"optimizer_kwargs\"][\"objective\"] = f\"Set the trainable scalar to exactly {TRACE_TARGET_VALUE}.\"\n", + " bundle[\"metadata\"][\"task_label\"] = \"Trace scalar\"\n", + " return bundle\n", + "\n", + "def make_dspy_lm(max_tokens: int = 200) -> Any:\n", + " model = os.environ.get(\"TRACE_LITELLM_MODEL\") or \"gpt-4o-mini\"\n", + " if \"/\" not in model and (\"gpt\" in model.lower() or model.lower().startswith(\"o\")):\n", + " model = f\"openai/{model}\"\n", + " lm_kwargs: dict[str, Any] = {\"cache\": False, \"max_tokens\": max_tokens}\n", + " api_base = os.environ.get(\"OPENAI_BASE_URL\") or os.environ.get(\"OPENAI_API_BASE\")\n", + " if api_base:\n", + " lm_kwargs[\"api_base\"] = api_base\n", + " return dspy.LM(model=model, **lm_kwargs)\n", + "\n", + "def make_dspy_demo_bundle() -> dict[str, Any]:\n", + " dspy.configure(lm=make_dspy_lm())\n", + " return {\n", + " \"param\": RoutingDSPyAgent(),\n", + " \"guide\": RoutingDSPyGuide(),\n", + " \"train_dataset\": DSPY_TRAIN_DATASET,\n", + " \"test_dataset\": DSPY_TEST_DATASET,\n", + " \"optimizer_kwargs\": {},\n", + " \"metadata\": {\"task_label\": \"DSPy routing code\", \"framework\": \"dspy\"},\n", + " }\n", + "\n", + "def short_text(value: Any, limit: int = 100) -> str:\n", + " text = str(value)\n", + " return text if len(text) <= limit else text[: limit - 3] + \"...\"\n", + "\n", + "def snapshot_trainable_value(bundle: dict[str, Any]) -> Any:\n", + " scalar = getattr(bundle[\"param\"], \"value\", None)\n", + " if scalar is None:\n", + " scalar = getattr(bundle[\"param\"], \"guess\", None)\n", + " if scalar is not None:\n", + " return getattr(scalar, \"data\", None)\n", + " predictor = getattr(bundle[\"param\"], \"predict\", 
None)\n", + " signature = getattr(predictor, \"signature\", None)\n", + " return short_text(getattr(signature, \"instructions\", type(bundle[\"param\"]).__name__))\n", + "\n", + "def task_label(bundle: dict[str, Any]) -> str:\n", + " metadata = bundle.get(\"metadata\", {})\n", + " return str(metadata.get(\"task_label\") or metadata.get(\"benchmark\") or \"demo\")\n", + "\n", + "def output_value(output: Any) -> Any:\n", + " return short_text(getattr(output, \"data\", output), limit=140)\n", + "\n", + "def score_guide(guide: Any, task_input: Any, response: Any, task_info: Any) -> tuple[float, str]:\n", + " score, feedback = guide(task_input, response, task_info) if callable(guide) else guide.get_feedback(task_input, response, task_info)\n", + " return float(score), str(feedback)\n", + "\n", + "def score_dataset(bundle: dict[str, Any], dataset: dict[str, list[Any]]) -> dict[str, Any]:\n", + " inputs = dataset.get(\"inputs\") or []\n", + " infos = dataset.get(\"infos\") or dataset.get(\"info\") or []\n", + " if len(inputs) != len(infos):\n", + " raise ValueError(\"Dataset 'inputs' and 'infos' must have the same length.\")\n", + " if not inputs:\n", + " raise ValueError(\"Dataset must contain at least one example.\")\n", + " rows = []\n", + " scores = []\n", + " for task_input, task_info in zip(inputs, infos):\n", + " response = output_value(bundle[\"param\"](task_input))\n", + " score, feedback = score_guide(bundle[\"guide\"], task_input, response, task_info)\n", + " scores.append(score)\n", + " rows.append({\"input\": task_input, \"expected\": task_info, \"output\": response, \"score\": score, \"feedback\": feedback})\n", + " return {\"mean_score\": sum(scores) / len(scores), \"rows\": rows}\n", + "\n", + "def run_train_bundle(\n", + " trainer_id: str,\n", + " params: dict[str, Any] | None = None,\n", + " mode: str = \"real\",\n", + " logger: str = \"none\",\n", + " bundle_factory: Callable[[], dict[str, Any]] = make_trace_demo_bundle,\n", + ") -> dict[str, Any]:\n", 
+ " bundle = bundle_factory()\n", + " params = params or {}\n", + " train_dataset = bundle[\"train_dataset\"]\n", + " test_dataset = bundle.get(\"test_dataset\") or train_dataset\n", + " before = {\n", + " \"value\": snapshot_trainable_value(bundle),\n", + " \"train\": score_dataset(bundle, train_dataset),\n", + " \"test\": score_dataset(bundle, test_dataset),\n", + " }\n", + " result = _train_bundle(\n", + " bundle=bundle,\n", + " trainer_spec=TrainerConfig(id=trainer_id, params_variants=[params], logger=logger),\n", + " params=params,\n", + " mode=mode,\n", + " )\n", + " after = {\n", + " \"value\": snapshot_trainable_value(bundle),\n", + " \"train\": score_dataset(bundle, train_dataset),\n", + " \"test\": score_dataset(bundle, test_dataset),\n", + " }\n", + " return {\n", + " \"trainer_id\": trainer_id,\n", + " \"task\": task_label(bundle),\n", + " \"mode\": mode,\n", + " \"result\": result,\n", + " \"before\": before,\n", + " \"after\": after,\n", + " \"train_examples\": len(train_dataset[\"inputs\"]),\n", + " \"test_examples\": len(test_dataset[\"inputs\"]),\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "33501896", + "metadata": {}, + "source": [ + "## 8. Real train/test optimization runs\n", + "\n", + "These runs use the real Trace-Bench trainer entry points and real installed trainer packages. Optimizer logs are captured so the notebook output stays focused on the comparison tables." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f62a8443", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T16:40:29.261316Z", + "iopub.status.busy": "2026-06-02T16:40:29.261235Z", + "iopub.status.idle": "2026-06-02T16:44:58.092383Z", + "shell.execute_reply": "2026-06-02T16:44:58.091779Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PrioritySearch: ok (OptoPrimeV2)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TextGradTrainer: ok (opto.optimizers.textgrad.TextGrad)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OpenEvolveTrainer: ok (openevolve.run_evolution)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DSPyTrainer: ok (dspy.MIPROv2)\n", + "Completed 4 real trainer runs.\n" + ] + } + ], + "source": [ + "if active_provider == \"none\":\n", + " raise RuntimeError(\"Real comparison requires OPENROUTER_API_KEY or OPENAI_API_KEY.\")\n", + "\n", + "REAL_TRAINERS = [\n", + " (\"PrioritySearch\", \"Trace scalar\", {\n", + " \"ps_steps\": 2,\n", + " \"ps_batches\": 1,\n", + " \"num_candidates\": 3,\n", + " \"num_proposals\": 2,\n", + " }, make_trace_demo_bundle),\n", + " (\"TextGradTrainer\", \"Trace scalar\", {\n", + " \"num_epochs\": 2,\n", + " \"batch_size\": 1,\n", + " \"ensure_improvement\": True,\n", + " \"improvement_threshold\": 1e-9,\n", + " \"max_tokens\": 1024,\n", + " }, make_trace_demo_bundle),\n", + " (\"OpenEvolveTrainer\", \"Trace scalar\", {\n", + " \"iterations\": 4,\n", + " \"population_size\": 8,\n", + " \"num_islands\": 1,\n", + " \"seed\": 3,\n", + " \"ensure_improvement\": True,\n", + " \"improvement_threshold\": 1e-9,\n", + " \"verbose\": False,\n", + " \"model\": os.environ.get(\"TRACE_LITELLM_MODEL\"),\n", + " \"api_base\": os.environ.get(\"OPENAI_BASE_URL\") or os.environ.get(\"OPENAI_API_BASE\"),\n", + " \"api_key_env\": \"OPENAI_API_KEY\",\n", + " \"max_tokens\": 
2048,\n", + " \"temperature\": 0.4,\n", + " }, make_trace_demo_bundle),\n", + " (\"DSPyTrainer\", \"DSPy routing code\", {\n", + " \"dspy_optimizer\": \"mipro\",\n", + " \"dspy_lm\": make_dspy_lm(),\n", + " \"auto\": None,\n", + " \"num_candidates\": 4,\n", + " \"num_trials\": 5,\n", + " \"max_labeled_demos\": 3,\n", + " \"max_bootstrapped_demos\": 1,\n", + " \"num_threads\": 1,\n", + " \"seed\": 7,\n", + " \"verbose\": False,\n", + " }, make_dspy_demo_bundle),\n", + "]\n", + "\n", + "smoke_results = []\n", + "for trainer_id, task, params, bundle_factory in REAL_TRAINERS:\n", + " captured = io.StringIO()\n", + " try:\n", + " with contextlib.redirect_stdout(captured), contextlib.redirect_stderr(captured):\n", + " item = run_train_bundle(trainer_id, params=params, mode=\"real\", bundle_factory=bundle_factory)\n", + " item[\"captured_log_tail\"] = \"\\n\".join(captured.getvalue().splitlines()[-8:])\n", + " smoke_results.append(item)\n", + " print(f\"{trainer_id}: {item['result'].get('status')} ({item['result'].get('resolved_optimizer')})\")\n", + " except Exception as exc:\n", + " smoke_results.append({\n", + " \"trainer_id\": trainer_id,\n", + " \"task\": task,\n", + " \"mode\": \"real\",\n", + " \"status\": \"error\",\n", + " \"error\": f\"{type(exc).__name__}: {exc}\",\n", + " \"captured_log_tail\": \"\\n\".join(captured.getvalue().splitlines()[-8:]),\n", + " })\n", + " print(f\"{trainer_id}: error\")\n", + "\n", + "print(f\"Completed {len(smoke_results)} real trainer runs.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f4ef5b90", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-02T16:44:58.093951Z", + "iopub.status.busy": "2026-06-02T16:44:58.093874Z", + "iopub.status.idle": "2026-06-02T16:44:58.144952Z", + "shell.execute_reply": "2026-06-02T16:44:58.144396Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 trainer_idtaskmodestatusresolved_optimizerbefore_valueafter_valuetrain_examplestest_examplesbefore_train_scoreafter_train_scoretrain_deltabefore_test_scoreafter_test_scoretest_deltaimprovementerror
0PrioritySearchTrace scalarrealokOptoPrimeV20.0000003.00000033-3.0000.0003.000-3.0000.0003.000YESNone
1TextGradTrainerTrace scalarrealokopto.optimizers.textgrad.TextGrad0.0000003.00000033-3.0000.0003.000-3.0000.0003.000YESNone
2OpenEvolveTrainerTrace scalarrealokopenevolve.run_evolution0.0000001.00000033-3.000-2.0001.000-3.000-2.0001.000YESNone
3DSPyTrainerDSPy routing coderealokdspy.MIPROv2Return the requested routing code as a single uppercase letter.Based on the customer tier mentioned in the ticket (e.g., \"customer tier scarlet\", \"customer tier...330.0001.0001.0000.0001.0001.000YESNone
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
All trainers improved on held-out examples.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trainer_idtasksplitphaseexampleinputexpectedoutputscore
0DSPyTrainerDSPy routing codetestafter0routing code for scarlet ticketAA1.0
1DSPyTrainerDSPy routing codetestbefore0routing code for scarlet ticketAS0.0
2DSPyTrainerDSPy routing codetestafter1routing code for azure ticketBB1.0
3DSPyTrainerDSPy routing codetestbefore1routing code for azure ticketBA0.0
4DSPyTrainerDSPy routing codetestafter2routing code for emerald ticketCC1.0
5DSPyTrainerDSPy routing codetestbefore2routing code for emerald ticketCE0.0
6DSPyTrainerDSPy routing codetrainafter0customer tier scarletAA1.0
7DSPyTrainerDSPy routing codetrainbefore0customer tier scarletAS0.0
8DSPyTrainerDSPy routing codetrainafter1customer tier azureBB1.0
9DSPyTrainerDSPy routing codetrainbefore1customer tier azureBA0.0
10DSPyTrainerDSPy routing codetrainafter2customer tier emeraldCC1.0
11DSPyTrainerDSPy routing codetrainbefore2customer tier emeraldCE0.0
12OpenEvolveTrainerTrace scalartestafter0test-a3.01.0-2.0
13OpenEvolveTrainerTrace scalartestbefore0test-a3.00.0-3.0
14PrioritySearchTrace scalartestafter0test-a3.03.0-0.0
15PrioritySearchTrace scalartestbefore0test-a3.00.0-3.0
16TextGradTrainerTrace scalartestafter0test-a3.03.0-0.0
17TextGradTrainerTrace scalartestbefore0test-a3.00.0-3.0
18OpenEvolveTrainerTrace scalartestafter1test-b3.01.0-2.0
19OpenEvolveTrainerTrace scalartestbefore1test-b3.00.0-3.0
20PrioritySearchTrace scalartestafter1test-b3.03.0-0.0
21PrioritySearchTrace scalartestbefore1test-b3.00.0-3.0
22TextGradTrainerTrace scalartestafter1test-b3.03.0-0.0
23TextGradTrainerTrace scalartestbefore1test-b3.00.0-3.0
24OpenEvolveTrainerTrace scalartestafter2test-c3.01.0-2.0
25OpenEvolveTrainerTrace scalartestbefore2test-c3.00.0-3.0
26PrioritySearchTrace scalartestafter2test-c3.03.0-0.0
27PrioritySearchTrace scalartestbefore2test-c3.00.0-3.0
28TextGradTrainerTrace scalartestafter2test-c3.03.0-0.0
29TextGradTrainerTrace scalartestbefore2test-c3.00.0-3.0
30OpenEvolveTrainerTrace scalartrainafter0train-a3.01.0-2.0
31OpenEvolveTrainerTrace scalartrainbefore0train-a3.00.0-3.0
32PrioritySearchTrace scalartrainafter0train-a3.03.0-0.0
33PrioritySearchTrace scalartrainbefore0train-a3.00.0-3.0
34TextGradTrainerTrace scalartrainafter0train-a3.03.0-0.0
35TextGradTrainerTrace scalartrainbefore0train-a3.00.0-3.0
36OpenEvolveTrainerTrace scalartrainafter1train-b3.01.0-2.0
37OpenEvolveTrainerTrace scalartrainbefore1train-b3.00.0-3.0
38PrioritySearchTrace scalartrainafter1train-b3.03.0-0.0
39PrioritySearchTrace scalartrainbefore1train-b3.00.0-3.0
40TextGradTrainerTrace scalartrainafter1train-b3.03.0-0.0
41TextGradTrainerTrace scalartrainbefore1train-b3.00.0-3.0
42OpenEvolveTrainerTrace scalartrainafter2train-c3.01.0-2.0
43OpenEvolveTrainerTrace scalartrainbefore2train-c3.00.0-3.0
44PrioritySearchTrace scalartrainafter2train-c3.03.0-0.0
45PrioritySearchTrace scalartrainbefore2train-c3.00.0-3.0
46TextGradTrainerTrace scalartrainafter2train-c3.03.0-0.0
47TextGradTrainerTrace scalartrainbefore2train-c3.00.0-3.0
\n", + "
" + ], + "text/plain": [ + " trainer_id task split phase example \\\n", + "0 DSPyTrainer DSPy routing code test after 0 \n", + "1 DSPyTrainer DSPy routing code test before 0 \n", + "2 DSPyTrainer DSPy routing code test after 1 \n", + "3 DSPyTrainer DSPy routing code test before 1 \n", + "4 DSPyTrainer DSPy routing code test after 2 \n", + "5 DSPyTrainer DSPy routing code test before 2 \n", + "6 DSPyTrainer DSPy routing code train after 0 \n", + "7 DSPyTrainer DSPy routing code train before 0 \n", + "8 DSPyTrainer DSPy routing code train after 1 \n", + "9 DSPyTrainer DSPy routing code train before 1 \n", + "10 DSPyTrainer DSPy routing code train after 2 \n", + "11 DSPyTrainer DSPy routing code train before 2 \n", + "12 OpenEvolveTrainer Trace scalar test after 0 \n", + "13 OpenEvolveTrainer Trace scalar test before 0 \n", + "14 PrioritySearch Trace scalar test after 0 \n", + "15 PrioritySearch Trace scalar test before 0 \n", + "16 TextGradTrainer Trace scalar test after 0 \n", + "17 TextGradTrainer Trace scalar test before 0 \n", + "18 OpenEvolveTrainer Trace scalar test after 1 \n", + "19 OpenEvolveTrainer Trace scalar test before 1 \n", + "20 PrioritySearch Trace scalar test after 1 \n", + "21 PrioritySearch Trace scalar test before 1 \n", + "22 TextGradTrainer Trace scalar test after 1 \n", + "23 TextGradTrainer Trace scalar test before 1 \n", + "24 OpenEvolveTrainer Trace scalar test after 2 \n", + "25 OpenEvolveTrainer Trace scalar test before 2 \n", + "26 PrioritySearch Trace scalar test after 2 \n", + "27 PrioritySearch Trace scalar test before 2 \n", + "28 TextGradTrainer Trace scalar test after 2 \n", + "29 TextGradTrainer Trace scalar test before 2 \n", + "30 OpenEvolveTrainer Trace scalar train after 0 \n", + "31 OpenEvolveTrainer Trace scalar train before 0 \n", + "32 PrioritySearch Trace scalar train after 0 \n", + "33 PrioritySearch Trace scalar train before 0 \n", + "34 TextGradTrainer Trace scalar train after 0 \n", + "35 TextGradTrainer Trace scalar 
train before 0 \n", + "36 OpenEvolveTrainer Trace scalar train after 1 \n", + "37 OpenEvolveTrainer Trace scalar train before 1 \n", + "38 PrioritySearch Trace scalar train after 1 \n", + "39 PrioritySearch Trace scalar train before 1 \n", + "40 TextGradTrainer Trace scalar train after 1 \n", + "41 TextGradTrainer Trace scalar train before 1 \n", + "42 OpenEvolveTrainer Trace scalar train after 2 \n", + "43 OpenEvolveTrainer Trace scalar train before 2 \n", + "44 PrioritySearch Trace scalar train after 2 \n", + "45 PrioritySearch Trace scalar train before 2 \n", + "46 TextGradTrainer Trace scalar train after 2 \n", + "47 TextGradTrainer Trace scalar train before 2 \n", + "\n", + " input expected output score \n", + "0 routing code for scarlet ticket A A 1.0 \n", + "1 routing code for scarlet ticket A S 0.0 \n", + "2 routing code for azure ticket B B 1.0 \n", + "3 routing code for azure ticket B A 0.0 \n", + "4 routing code for emerald ticket C C 1.0 \n", + "5 routing code for emerald ticket C E 0.0 \n", + "6 customer tier scarlet A A 1.0 \n", + "7 customer tier scarlet A S 0.0 \n", + "8 customer tier azure B B 1.0 \n", + "9 customer tier azure B A 0.0 \n", + "10 customer tier emerald C C 1.0 \n", + "11 customer tier emerald C E 0.0 \n", + "12 test-a 3.0 1.0 -2.0 \n", + "13 test-a 3.0 0.0 -3.0 \n", + "14 test-a 3.0 3.0 -0.0 \n", + "15 test-a 3.0 0.0 -3.0 \n", + "16 test-a 3.0 3.0 -0.0 \n", + "17 test-a 3.0 0.0 -3.0 \n", + "18 test-b 3.0 1.0 -2.0 \n", + "19 test-b 3.0 0.0 -3.0 \n", + "20 test-b 3.0 3.0 -0.0 \n", + "21 test-b 3.0 0.0 -3.0 \n", + "22 test-b 3.0 3.0 -0.0 \n", + "23 test-b 3.0 0.0 -3.0 \n", + "24 test-c 3.0 1.0 -2.0 \n", + "25 test-c 3.0 0.0 -3.0 \n", + "26 test-c 3.0 3.0 -0.0 \n", + "27 test-c 3.0 0.0 -3.0 \n", + "28 test-c 3.0 3.0 -0.0 \n", + "29 test-c 3.0 0.0 -3.0 \n", + "30 train-a 3.0 1.0 -2.0 \n", + "31 train-a 3.0 0.0 -3.0 \n", + "32 train-a 3.0 3.0 -0.0 \n", + "33 train-a 3.0 0.0 -3.0 \n", + "34 train-a 3.0 3.0 -0.0 \n", + "35 train-a 3.0 0.0 
-3.0 \n", + "36 train-b 3.0 1.0 -2.0 \n", + "37 train-b 3.0 0.0 -3.0 \n", + "38 train-b 3.0 3.0 -0.0 \n", + "39 train-b 3.0 0.0 -3.0 \n", + "40 train-b 3.0 3.0 -0.0 \n", + "41 train-b 3.0 0.0 -3.0 \n", + "42 train-c 3.0 1.0 -2.0 \n", + "43 train-c 3.0 0.0 -3.0 \n", + "44 train-c 3.0 3.0 -0.0 \n", + "45 train-c 3.0 0.0 -3.0 \n", + "46 train-c 3.0 3.0 -0.0 \n", + "47 train-c 3.0 0.0 -3.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PrioritySearch: -3.0 -> 0.0 (test_delta=3.0, improvement=YES)\n", + "TextGradTrainer: -3.0 -> 0.0 (test_delta=3.0, improvement=YES)\n", + "OpenEvolveTrainer: -3.0 -> -2.0 (test_delta=1.0, improvement=YES)\n", + "DSPyTrainer: 0.0 -> 1.0 (test_delta=1.0, improvement=YES)\n" + ] + } + ], + "source": [ + "summary_rows = []\n", + "example_rows = []\n", + "for item in smoke_results:\n", + " if \"result\" not in item:\n", + " summary_rows.append({\n", + " \"trainer_id\": item[\"trainer_id\"],\n", + " \"task\": item[\"task\"],\n", + " \"mode\": item[\"mode\"],\n", + " \"status\": item[\"status\"],\n", + " \"resolved_optimizer\": None,\n", + " \"before_value\": None,\n", + " \"after_value\": None,\n", + " \"train_examples\": None,\n", + " \"test_examples\": None,\n", + " \"before_train_score\": None,\n", + " \"after_train_score\": None,\n", + " \"train_delta\": None,\n", + " \"before_test_score\": None,\n", + " \"after_test_score\": None,\n", + " \"test_delta\": None,\n", + " \"improvement\": \"NO\",\n", + " \"error\": item[\"error\"],\n", + " })\n", + " continue\n", + "\n", + " result_status = item[\"result\"].get(\"status\")\n", + " before_train = item[\"before\"][\"train\"][\"mean_score\"]\n", + " after_train = item[\"after\"][\"train\"][\"mean_score\"]\n", + " before_test = item[\"before\"][\"test\"][\"mean_score\"]\n", + " after_test = item[\"after\"][\"test\"][\"mean_score\"]\n", + " test_delta = after_test - before_test\n", + " improved = result_status == 
\"ok\" and test_delta > 0\n", + " summary_rows.append({\n", + " \"trainer_id\": item[\"trainer_id\"],\n", + " \"task\": item[\"task\"],\n", + " \"mode\": item[\"mode\"],\n", + " \"status\": result_status,\n", + " \"resolved_optimizer\": item[\"result\"].get(\"resolved_optimizer\"),\n", + " \"before_value\": item[\"before\"][\"value\"],\n", + " \"after_value\": item[\"after\"][\"value\"],\n", + " \"train_examples\": item[\"train_examples\"],\n", + " \"test_examples\": item[\"test_examples\"],\n", + " \"before_train_score\": before_train,\n", + " \"after_train_score\": after_train,\n", + " \"train_delta\": after_train - before_train,\n", + " \"before_test_score\": before_test,\n", + " \"after_test_score\": after_test,\n", + " \"test_delta\": test_delta,\n", + " \"improvement\": \"YES\" if improved else \"NO\",\n", + " \"error\": item[\"result\"].get(\"error\"),\n", + " })\n", + " for split_name in (\"train\", \"test\"):\n", + " for phase in (\"before\", \"after\"):\n", + " for index, row in enumerate(item[phase][split_name][\"rows\"]):\n", + " example_rows.append({\n", + " \"trainer_id\": item[\"trainer_id\"],\n", + " \"task\": item[\"task\"],\n", + " \"split\": split_name,\n", + " \"phase\": phase,\n", + " \"example\": index,\n", + " \"input\": row[\"input\"],\n", + " \"expected\": row[\"expected\"],\n", + " \"output\": row[\"output\"],\n", + " \"score\": row[\"score\"],\n", + " })\n", + "\n", + "trainer_comparison = pd.DataFrame(summary_rows)\n", + "example_comparison = pd.DataFrame(example_rows)\n", + "\n", + "score_columns = [\"before_train_score\", \"after_train_score\", \"train_delta\", \"before_test_score\", \"after_test_score\", \"test_delta\"]\n", + "def mark_no_improvement(row: pd.Series) -> list[str]:\n", + " style = \"background-color: #ffd6d6; color: #9f0000; font-weight: 700\"\n", + " return [style if row.get(\"improvement\") != \"YES\" else \"\" for _ in row]\n", + "\n", + "styled_comparison = (\n", + " trainer_comparison.style\n", + " 
.apply(mark_no_improvement, axis=1)\n", + " .format({column: \"{:.3f}\" for column in score_columns})\n", + ")\n", + "display(styled_comparison)\n", + "\n", + "no_improvement = trainer_comparison[trainer_comparison[\"improvement\"] != \"YES\"]\n", + "if no_improvement.empty:\n", + " display(HTML(\"
All trainers improved on held-out examples.
\"))\n", + "else:\n", + " names = \", \".join(no_improvement[\"trainer_id\"].astype(str).tolist())\n", + " display(HTML(f\"
NO HELD-OUT IMPROVEMENT: {names}
\"))\n", + "\n", + "if example_rows:\n", + " display(example_comparison.sort_values([\"task\", \"split\", \"example\", \"trainer_id\", \"phase\"]).reset_index(drop=True))\n", + "else:\n", + " print(\"No per-example outputs were produced because all real trainer runs errored.\")\n", + "\n", + "for _, row in trainer_comparison.iterrows():\n", + " print(\n", + " f\"{row['trainer_id']}: {row['before_test_score']} -> {row['after_test_score']} \"\n", + " f\"(test_delta={row['test_delta']}, improvement={row['improvement']})\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "72ec5773", + "metadata": {}, + "source": [ + "## 9. Practical reading guide\n", + "\n", + "Read red rows first. A red row means the real trainer path either failed or did not improve the held-out score. Then inspect the per-example table to see whether the trainer changed the parameter/instruction and whether the change generalized beyond the three training examples." + ] + }, + { + "cell_type": "markdown", + "id": "83656ee2", + "metadata": {}, + "source": [ + "## 10. 
What counts as success\n", + "\n", + "### Strong success\n", + "- focused tests pass\n", + "- discovery shows the comparison trainers\n", + "- real `opto.optimizers.textgrad.TextGrad`, `openevolve.run_evolution`, and `dspy.LM` import successfully\n", + "- all real trainer rows complete\n", + "- no rows are highlighted red\n", + "\n", + "### Needs follow-up\n", + "- a trainer reports an error row\n", + "- a trainer completes but is highlighted red because held-out score did not improve\n", + "- per-example outputs show memorization or no meaningful parameter/instruction change" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/setup.py b/setup.py index a808cda..769805e 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,7 @@ "tensorboardX", "tensorboard", "pyyaml", + "nest-asyncio>=1.6.0", ] # Optional dependencies for external trainers in trace_bench/trainers/. 
diff --git a/tests/test_cli_external_trainer_validation.py b/tests/test_cli_external_trainer_validation.py new file mode 100644 index 0000000..b86f303 --- /dev/null +++ b/tests/test_cli_external_trainer_validation.py @@ -0,0 +1,37 @@ +from trace_bench.cli import _validate_trainer_params +from trace_bench.config import TrainerConfig + + +class _FakeExternalTrainer: + USES_TRACE_OPTIMIZER = False + + def train( + self, + guide, + train_dataset, + *, + iterations: int = 1, + ensure_improvement: bool = True, + verbose: bool = False, + **_kwargs, + ): + return {} + + +def test_validate_trainer_params_uses_train_signature(monkeypatch) -> None: + monkeypatch.setattr("trace_bench.cli._resolve_algorithm", lambda _trainer_id: _FakeExternalTrainer) + trainer = TrainerConfig( + id="OpenEvolveTrainer", + params_variants=[{"iterations": 2, "ensure_improvement": False}], + ) + errors = [] + _validate_trainer_params(trainer, errors) + assert errors == [] + + +def test_validate_trainer_params_rejects_unknown_kwarg(monkeypatch) -> None: + monkeypatch.setattr("trace_bench.cli._resolve_algorithm", lambda _trainer_id: _FakeExternalTrainer) + trainer = TrainerConfig(id="OpenEvolveTrainer", params_variants=[{"unknown": 1}]) + errors = [] + _validate_trainer_params(trainer, errors) + assert errors == ["unknown trainer kwarg 'unknown' for OpenEvolveTrainer"] diff --git a/tests/test_dspy_trainer.py b/tests/test_dspy_trainer.py new file mode 100644 index 0000000..63cd74a --- /dev/null +++ b/tests/test_dspy_trainer.py @@ -0,0 +1,28 @@ +import itertools +from typing import Any + +import pytest + + +def test_dspy_trainer_restores_empty_global_lm(monkeypatch: pytest.MonkeyPatch) -> None: + dspy = pytest.importorskip("dspy") + trainer_module = pytest.importorskip("trace_bench.trainers.dspy_trainer") + from dspy.utils import DummyLM + + previous_lm = getattr(dspy.settings, "lm", None) + dspy.configure(lm=None) + trainer = trainer_module.DSPyTrainer(object()) + + def _train_inner(**_kwargs: Any) -> 
dict[str, str]: + return {"status": "ok"} + + monkeypatch.setattr(trainer, "_train_inner", _train_inner) + try: + trainer.train( + guide=object(), + train_dataset={"inputs": [], "infos": []}, + dspy_lm=DummyLM(itertools.cycle([{"answer": "ok"}])), + ) + assert getattr(dspy.settings, "lm", None) is None + finally: + dspy.configure(lm=previous_lm) diff --git a/tests/test_external_trainer_discovery.py b/tests/test_external_trainer_discovery.py new file mode 100644 index 0000000..78bc5e9 --- /dev/null +++ b/tests/test_external_trainer_discovery.py @@ -0,0 +1,34 @@ +import importlib +import sys +import types + +from trace_bench.registry import discover_trainers + + +def _install_fake_external_dependencies(monkeypatch) -> None: + fake_textgrad_module = types.ModuleType("opto.optimizers.textgrad") + + class _FakeTextGrad: + def __init__(self, parameters, **_kwargs) -> None: + self.parameters = list(parameters) + + fake_textgrad_module.TextGrad = _FakeTextGrad + monkeypatch.setitem(sys.modules, "opto.optimizers.textgrad", fake_textgrad_module) + + fake_openevolve_module = types.ModuleType("openevolve") + fake_openevolve_module.run_evolution = lambda **_kwargs: {"best_code": 'candidate = {}'} + monkeypatch.setitem(sys.modules, "openevolve", fake_openevolve_module) + + +def test_discover_trainers_lists_new_external_trainers_when_dependencies_are_available(monkeypatch) -> None: + _install_fake_external_dependencies(monkeypatch) + + import trace_bench.trainers.textgrad_trainer as textgrad_trainer + import trace_bench.trainers.openevolve_trainer as openevolve_trainer + + importlib.reload(textgrad_trainer) + importlib.reload(openevolve_trainer) + + specs = {spec.id: spec for spec in discover_trainers()} + assert specs["TextGradTrainer"].available is True + assert specs["OpenEvolveTrainer"].available is True diff --git a/tests/test_external_utils.py b/tests/test_external_utils.py new file mode 100644 index 0000000..d72a41e --- /dev/null +++ b/tests/test_external_utils.py @@ -0,0 
+1,18 @@ +from trace_bench.trainers._external_utils import apply_parameter_updates + + +class _ReadOnlyDataParam: + def __init__(self, value: str) -> None: + self._data = value + + @property + def data(self) -> str: + return self._data + + +def test_apply_parameter_updates_falls_back_to_private_data_slot_when_data_property_has_no_setter() -> None: + parameter = _ReadOnlyDataParam("before") + + apply_parameter_updates({parameter: "after"}) + + assert parameter.data == "after" diff --git a/tests/test_llm_utils.py b/tests/test_llm_utils.py new file mode 100644 index 0000000..1f2aaa0 --- /dev/null +++ b/tests/test_llm_utils.py @@ -0,0 +1,19 @@ +import pytest + +from trace_bench.llm import openai_compatible_model_name + + +def test_openai_compatible_model_name_strips_openrouter_prefix() -> None: + assert ( + openai_compatible_model_name("openrouter/openai/gpt-4o-mini") + == "openai/gpt-4o-mini" + ) + + +def test_openai_compatible_model_name_keeps_other_model_names() -> None: + assert openai_compatible_model_name("gpt-4o-mini") == "gpt-4o-mini" + + +def test_openai_compatible_model_name_requires_string() -> None: + with pytest.raises(TypeError, match="model must be a string"): + openai_compatible_model_name(None) # type: ignore[arg-type] diff --git a/tests/test_openevolve_trainer.py b/tests/test_openevolve_trainer.py new file mode 100644 index 0000000..22d26b9 --- /dev/null +++ b/tests/test_openevolve_trainer.py @@ -0,0 +1,209 @@ +import asyncio +import importlib +import os +import sys +import tempfile +import types + +import pytest + + +class _DummyParam: + def __init__(self, name: str, value: str) -> None: + self.name = name + self.py_name = name + self.data = value + self.trainable = True + + +class _DummyAgent: + def __init__(self, greeting: str = "Hi") -> None: + self.greeting = _DummyParam("greeting", greeting) + + def parameters(self): + return [self.greeting] + + def __call__(self, query: str) -> str: + name = query.split()[-1].strip("!.?") + return 
f"{self.greeting.data}, {name}!" + + +class _DummyGuide: + def __call__(self, task_input: str, response: str, task_info: str): + del task_input + return (1.0 if response == task_info else 0.0), f"expected {task_info}" + + +def _install_fake_openevolve_config(monkeypatch: pytest.MonkeyPatch) -> None: + fake_config_module = types.ModuleType("openevolve.config") + + class _FakeDatabaseConfig: + def __init__(self) -> None: + self.population_size = 1000 + self.num_islands = 5 + + class _FakeLLMConfig: + def __init__(self) -> None: + self.api_base = "https://api.openai.com/v1" + self.api_key = None + self.max_tokens = 4096 + self.temperature = 0.7 + self.timeout = 60 + self.retries = 3 + self.retry_delay = 5 + self.models = [] + self.evaluator_models = [] + + class _FakeConfig: + def __init__(self, max_iterations: int, random_seed: int | None) -> None: + self.max_iterations = max_iterations + self.random_seed = random_seed + self.database = _FakeDatabaseConfig() + self.llm = _FakeLLMConfig() + + class _FakeLLMModelConfig: + def __init__(self, **kwargs: object) -> None: + self.kwargs = kwargs + + fake_config_module.Config = _FakeConfig + fake_config_module.LLMModelConfig = _FakeLLMModelConfig + monkeypatch.setitem(sys.modules, "openevolve.config", fake_config_module) + + +def _import_openevolve_trainer(monkeypatch: pytest.MonkeyPatch, best_code: str, capture: dict[str, object] | None = None) -> types.ModuleType: + fake_module = types.ModuleType("openevolve") + + def _run_evolution(*, initial_program, evaluator, iterations, config=None, **_kwargs): + if capture is not None: + with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as handle: + handle.write(best_code) + candidate_path = handle.name + try: + capture["evaluation"] = evaluator(candidate_path) + finally: + os.unlink(candidate_path) + capture["config"] = config + capture["iterations"] = iterations + del initial_program + return types.SimpleNamespace(best_code=best_code) + + fake_module.run_evolution = 
_run_evolution + monkeypatch.setitem(sys.modules, "openevolve", fake_module) + _install_fake_openevolve_config(monkeypatch) + sys.modules.pop("trace_bench.trainers.openevolve_trainer", None) + return importlib.import_module("trace_bench.trainers.openevolve_trainer") + + +def test_openevolve_trainer_updates_parameter(monkeypatch) -> None: + trainer_module = _import_openevolve_trainer( + monkeypatch, + best_code='candidate = {"greeting": "Hello"}\n', + ) + trainer = trainer_module.OpenEvolveTrainer(_DummyAgent("Hi")) + result = trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + iterations=1, + ensure_improvement=False, + ) + assert result["status"] == "ok" + assert result["resolved_optimizer"] == "openevolve.run_evolution" + assert trainer.param.greeting.data == "Hello" + + +def test_openevolve_trainer_runs_inside_active_event_loop(monkeypatch) -> None: + trainer_module = _import_openevolve_trainer( + monkeypatch, + best_code='candidate = {"greeting": "Hello"}\n', + ) + + async def _run_training() -> dict[str, object]: + trainer = trainer_module.OpenEvolveTrainer(_DummyAgent("Hi")) + return trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + iterations=1, + ensure_improvement=False, + ) + + result = asyncio.run(_run_training()) + assert result["status"] == "ok" + + +def test_openevolve_trainer_rejects_worse_candidate(monkeypatch) -> None: + trainer_module = _import_openevolve_trainer( + monkeypatch, + best_code='candidate = {"greeting": "Bad"}\n', + ) + trainer = trainer_module.OpenEvolveTrainer(_DummyAgent("Hello")) + trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + iterations=1, + ensure_improvement=True, + ) + assert trainer.param.greeting.data == "Hello" + + +def test_openevolve_trainer_rejects_invalid_candidate_program(monkeypatch) -> None: + 
trainer_module = _import_openevolve_trainer( + monkeypatch, + best_code='print("bad candidate")\n', + ) + trainer = trainer_module.OpenEvolveTrainer(_DummyAgent("Hello")) + with pytest.raises(ValueError, match="Candidate program"): + trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + iterations=1, + ensure_improvement=False, + ) + + +def test_openevolve_trainer_requires_trainable_parameters(monkeypatch) -> None: + trainer_module = _import_openevolve_trainer( + monkeypatch, + best_code='candidate = {"greeting": "Hello"}\n', + ) + + class _NoTrainables: + def parameters(self): + return [] + + trainer = trainer_module.OpenEvolveTrainer(_NoTrainables()) + with pytest.raises(ValueError, match="no trainable parameters"): + trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + iterations=1, + ) + + +def test_openevolve_trainer_returns_combined_score_and_configures_population(monkeypatch) -> None: + capture: dict = {} + trainer_module = _import_openevolve_trainer( + monkeypatch, + best_code='candidate = {"greeting": "Hello"}\n', + capture=capture, + ) + trainer = trainer_module.OpenEvolveTrainer(_DummyAgent("Hi")) + trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + iterations=2, + population_size=12, + num_islands=3, + model="openrouter/openai/gpt-4o-mini", + api_key="test-key", + ensure_improvement=False, + ) + assert capture["evaluation"]["combined_score"] == capture["evaluation"]["score"] + assert capture["config"].database.population_size == 12 + assert capture["config"].database.num_islands == 3 + assert capture["config"].llm.models[0].kwargs["name"] == "openai/gpt-4o-mini" diff --git a/tests/test_resolve_external_trainers.py b/tests/test_resolve_external_trainers.py new file mode 100644 index 0000000..9a7c7d6 --- /dev/null +++ 
b/tests/test_resolve_external_trainers.py @@ -0,0 +1,16 @@ +from trace_bench.resolve import resolve_trainer_kwargs + + +def test_resolve_trainer_kwargs_does_not_inject_gepa_defaults_for_external_trainers() -> None: + assert resolve_trainer_kwargs({}, "TextGradTrainer") == {} + assert resolve_trainer_kwargs({"iterations": 3}, "OpenEvolveTrainer") == { + "iterations": 3 + } + + +def test_resolve_trainer_kwargs_preserves_gepa_defaults() -> None: + resolved = resolve_trainer_kwargs({}, "GEPA-UCB") + assert resolved["num_search_iterations"] == 1 + assert resolved["train_batch_size"] == 2 + assert resolved["merge_every"] == 2 + assert resolved["pareto_subset_size"] == 2 diff --git a/tests/test_runner_external_mode.py b/tests/test_runner_external_mode.py new file mode 100644 index 0000000..3b5b4a8 --- /dev/null +++ b/tests/test_runner_external_mode.py @@ -0,0 +1,76 @@ +from trace_bench.config import TrainerConfig +from trace_bench.runner import _train_bundle + + +class _DummyAgent: + def parameters(self): + return [] + + def __call__(self, query): + return query + + +class _DummyGuide: + def __call__(self, task_input, response, task_info): + return 1.0, "ok" + + +class _FakeExternalTrainer: + USES_TRACE_OPTIMIZER = False + + def __init__(self, agent, logger=None): + del logger + self.param = agent + + def train(self, guide, train_dataset, mode="real", **_kwargs): + del guide, train_dataset + return {"status": "ok", "resolved_optimizer": mode} + + +def test_runner_passes_mode_to_external_trainers(monkeypatch) -> None: + monkeypatch.setattr("trace_bench.runner._resolve_algorithm", lambda _name: _FakeExternalTrainer) + bundle = { + "param": _DummyAgent(), + "guide": _DummyGuide(), + "train_dataset": {"inputs": ["x"], "infos": ["y"]}, + "optimizer_kwargs": {}, + "metadata": {}, + } + trainer = TrainerConfig(id="FakeExternalTrainer", logger="none") + result = _train_bundle(bundle=bundle, trainer_spec=trainer, params={}, mode="stub") + assert result["status"] == "ok" + assert 
result["resolved_optimizer"] == "stub" + + +class _FakeNonDSPyExternalTrainer(_FakeExternalTrainer): + FRAMEWORK = "trace" + + def train(self, guide, train_dataset, mode="real", **kwargs): + del guide, train_dataset, mode + return {"status": "ok", "resolved_optimizer": kwargs.get("dspy_lm", "absent")} + + +class _FakeDSPyExternalTrainer(_FakeExternalTrainer): + FRAMEWORK = "dspy" + + def train(self, guide, train_dataset, mode="real", **kwargs): + del guide, train_dataset, mode + return {"status": "ok", "resolved_optimizer": kwargs.get("dspy_lm", "absent")} + + +def test_runner_does_not_inject_dspy_stub_into_non_dspy_external_trainers(monkeypatch) -> None: + monkeypatch.setattr("trace_bench.runner._resolve_algorithm", lambda _name: _FakeNonDSPyExternalTrainer) + bundle = {"param": _DummyAgent(), "guide": _DummyGuide(), "train_dataset": {"inputs": ["x"], "infos": ["y"]}, "optimizer_kwargs": {}, "metadata": {}} + trainer = TrainerConfig(id="FakeNonDSPyExternalTrainer", logger="none") + result = _train_bundle(bundle=bundle, trainer_spec=trainer, params={}, mode="stub") + assert result["status"] == "ok" + assert result["resolved_optimizer"] == "absent" + + +def test_runner_injects_dspy_stub_only_for_dspy_external_trainers(monkeypatch) -> None: + monkeypatch.setattr("trace_bench.runner._resolve_algorithm", lambda _name: _FakeDSPyExternalTrainer) + bundle = {"param": _DummyAgent(), "guide": _DummyGuide(), "train_dataset": {"inputs": ["x"], "infos": ["y"]}, "optimizer_kwargs": {}, "metadata": {}} + trainer = TrainerConfig(id="FakeDSPyExternalTrainer", logger="none") + result = _train_bundle(bundle=bundle, trainer_spec=trainer, params={}, mode="stub") + assert result["status"] == "ok" + assert result["resolved_optimizer"] == "stub" diff --git a/tests/test_textgrad_trainer.py b/tests/test_textgrad_trainer.py new file mode 100644 index 0000000..f1ccc41 --- /dev/null +++ b/tests/test_textgrad_trainer.py @@ -0,0 +1,121 @@ +import importlib +import sys +import types + +import 
pytest + + +class _DummyParam: + def __init__(self, name: str, value: str) -> None: + self.name = name + self.py_name = name + self.data = value + self.trainable = True + + +class _DummyAgent: + def __init__(self, greeting: str = "Hi") -> None: + self.greeting = _DummyParam("greeting", greeting) + + def parameters(self): + return [self.greeting] + + def __call__(self, query: str) -> str: + name = query.split()[-1].strip("!.?") + return f"{self.greeting.data}, {name}!" + + +class _DummyGuide: + def __call__(self, task_input: str, response: str, task_info: str): + del task_input + return (1.0 if response == task_info else 0.0), f"expected {task_info}" + + +def _import_textgrad_trainer(monkeypatch: pytest.MonkeyPatch, proposal: str, capture: dict[str, object] | None = None) -> types.ModuleType: + fake_module = types.ModuleType("opto.optimizers.textgrad") + + class _FakeTextGrad: + def __init__(self, parameters, **_kwargs) -> None: + self.parameters = list(parameters) + if capture is not None: + capture["init_kwargs"] = _kwargs + + def zero_feedback(self) -> None: + return None + + def backward(self, target, feedback) -> None: + del target, feedback + return None + + def step(self, bypassing=False, verbose=False): + del bypassing, verbose + return {self.parameters[0]: proposal} + + fake_module.TextGrad = _FakeTextGrad + monkeypatch.setitem(sys.modules, "opto.optimizers.textgrad", fake_module) + sys.modules.pop("trace_bench.trainers.textgrad_trainer", None) + return importlib.import_module("trace_bench.trainers.textgrad_trainer") + + +def test_textgrad_trainer_updates_parameter(monkeypatch) -> None: + trainer_module = _import_textgrad_trainer(monkeypatch, proposal="Hello") + trainer = trainer_module.TextGradTrainer(_DummyAgent("Hi")) + result = trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + num_epochs=1, + batch_size=1, + ensure_improvement=False, + ) + assert result["status"] == "ok" + assert 
result["resolved_optimizer"] == "opto.optimizers.textgrad.TextGrad" + assert trainer.param.greeting.data == "Hello" + + +def test_textgrad_trainer_rejects_worse_candidate(monkeypatch) -> None: + trainer_module = _import_textgrad_trainer(monkeypatch, proposal="Bad") + trainer = trainer_module.TextGradTrainer(_DummyAgent("Hello")) + trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + num_epochs=1, + batch_size=1, + ensure_improvement=True, + ) + assert trainer.param.greeting.data == "Hello" + + +def test_textgrad_trainer_requires_trainable_parameters(monkeypatch) -> None: + trainer_module = _import_textgrad_trainer(monkeypatch, proposal="Hello") + + class _NoTrainables: + def parameters(self): + return [] + + trainer = trainer_module.TextGradTrainer(_NoTrainables()) + with pytest.raises(ValueError, match="no trainable parameters"): + trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + ) + + +def test_textgrad_trainer_forwards_llm(monkeypatch: pytest.MonkeyPatch) -> None: + """TextGradTrainer forwards explicit LLM objects to NewTrace TextGrad.""" + capture: dict[str, object] = {} + trainer_module = _import_textgrad_trainer(monkeypatch, proposal="Hello", capture=capture) + trainer = trainer_module.TextGradTrainer(_DummyAgent("Hi")) + llm = object() + trainer.train( + guide=_DummyGuide(), + train_dataset={"inputs": ["Hello Sam"], "infos": ["Hello, Sam!"]}, + mode="real", + ensure_improvement=False, + llm=llm, + ) + init_kwargs = capture["init_kwargs"] + assert isinstance(init_kwargs, dict) + assert init_kwargs["llm"] is llm diff --git a/trace_bench/cli.py b/trace_bench/cli.py index 853b9b8..8813326 100644 --- a/trace_bench/cli.py +++ b/trace_bench/cli.py @@ -1,6 +1,7 @@ from __future__ import annotations import argparse +import inspect import json from datetime import datetime from pathlib import Path @@ -15,7 +16,7 @@ 
load_task_bundle, ) from trace_bench.resolve import merge_kwargs, resolve_trainer_kwargs -from trace_bench.runner import BenchRunner, _has_trainables +from trace_bench.runner import BenchRunner, _has_trainables, _resolve_algorithm from trace_bench.artifacts import init_run_dir, write_manifest from trace_bench.ui import launch_ui @@ -67,6 +68,7 @@ def _task_in_bench(task_key: str, bench: str | None) -> bool: "num_iters", "num_search_iterations", "train_batch_size", + "batch_size", "merge_every", "pareto_subset_size", "ps_steps", @@ -92,6 +94,25 @@ def _resolve_symbol(module_name: str, symbol: str) -> bool: return False +def _allowed_trainer_kwargs_for(trainer_id: str) -> set[str]: + """Return the trainer kwargs accepted by strict validation for a trainer id.""" + allowed = set(_ALLOWED_TRAINER_KWARGS) + resolved = _resolve_algorithm(trainer_id) + if not isinstance(resolved, type): + return allowed + + try: + signature = inspect.signature(resolved.train) + except (TypeError, ValueError): + return allowed + + ignored = {"self", "guide", "train_dataset", "validate_dataset", "test_dataset", "mode"} + for name, parameter in signature.parameters.items(): + if name in ignored: + continue + if parameter.kind in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY): + allowed.add(name) + return allowed def _normalize_logger_override(raw: str | None) -> str | None: @@ -128,9 +149,10 @@ def _default_timeout(mode: str) -> float: def _validate_trainer_params(trainer, errors: list[str]) -> None: + allowed_kwargs = _allowed_trainer_kwargs_for(trainer.id) for params in trainer.params_variants or [{}]: for key in params.keys(): - if key not in _ALLOWED_TRAINER_KWARGS: + if key not in allowed_kwargs: errors.append(f"unknown trainer kwarg '{key}' for {trainer.id}") if trainer.optimizer and not _resolve_symbol("opto.optimizers", trainer.optimizer): diff --git a/trace_bench/llm.py b/trace_bench/llm.py new file mode 100644 index 0000000..b3f6034 --- /dev/null +++ 
b/trace_bench/llm.py @@ -0,0 +1,10 @@ +from __future__ import annotations + + +def openai_compatible_model_name(model: str) -> str: + """Return the model identifier expected by OpenAI-compatible clients.""" + if not isinstance(model, str): + raise TypeError("model must be a string.") + if model.startswith("openrouter/"): + return model.split("/", 1)[1] + return model diff --git a/trace_bench/resolve.py b/trace_bench/resolve.py index e285341..475b7e0 100644 --- a/trace_bench/resolve.py +++ b/trace_bench/resolve.py @@ -4,34 +4,56 @@ _FILTERED_KWARGS = {"eval_kwargs", "optimizer_kwargs"} +_GEPA_TRAINERS = {"GEPA-Base", "GEPA-UCB", "GEPA-Beam"} def _default_trainer_kwargs(algo_name: str) -> Dict[str, Any]: + """Return default kwargs for built-in Trace search trainers only.""" if algo_name == "PrioritySearch": - return dict(num_epochs=1, num_steps=1, num_batches=1, num_candidates=2, num_proposals=2) + return dict( + num_epochs=1, + num_steps=1, + num_batches=1, + num_candidates=2, + num_proposals=2, + ) if algo_name == "GEPA-Base": return dict(num_iters=1, train_batch_size=2, merge_every=2, pareto_subset_size=2) - # GEPA-UCB and GEPA-Beam use num_search_iterations - return dict(num_search_iterations=1, train_batch_size=2, merge_every=2, pareto_subset_size=2) + if algo_name in {"GEPA-UCB", "GEPA-Beam"}: + return dict( + num_search_iterations=1, + train_batch_size=2, + merge_every=2, + pareto_subset_size=2, + ) + return {} def _param_alias_map(algo_name: str) -> Dict[str, str]: - base = { + alias_map = { "threads": "num_threads", - "ps_steps": "num_steps", - "ps_batches": "num_batches", - "ps_candidates": "num_candidates", - "ps_proposals": "num_proposals", - "ps_mem_update": "memory_update_frequency", - "gepa_train_bs": "train_batch_size", - "gepa_merge_every": "merge_every", - "gepa_pareto_subset": "pareto_subset_size", } - if algo_name == "GEPA-Base": - base["gepa_iters"] = "num_iters" - else: - base["gepa_iters"] = "num_search_iterations" - return base + if algo_name 
== "PrioritySearch": + alias_map.update( + { + "ps_steps": "num_steps", + "ps_batches": "num_batches", + "ps_candidates": "num_candidates", + "ps_proposals": "num_proposals", + "ps_mem_update": "memory_update_frequency", + "batch_size": "train_batch_size", + } + ) + if algo_name in _GEPA_TRAINERS: + alias_map.update( + { + "gepa_train_bs": "train_batch_size", + "gepa_merge_every": "merge_every", + "gepa_pareto_subset": "pareto_subset_size", + } + ) + alias_map["gepa_iters"] = "num_iters" if algo_name == "GEPA-Base" else "num_search_iterations" + return alias_map def resolve_trainer_kwargs(params: Dict[str, Any], algo_name: str) -> Dict[str, Any]: diff --git a/trace_bench/runner.py b/trace_bench/runner.py index c5bc838..2763dab 100644 --- a/trace_bench/runner.py +++ b/trace_bench/runner.py @@ -512,13 +512,14 @@ def _dummy_response(*_args, **_kwargs): uses_trace_optimizer = getattr(algo, "USES_TRACE_OPTIMIZER", True) - # For DSPy-style external trainers: propagate mode='stub' as - # dspy_lm='stub' so they configure DummyLM without requiring an explicit - # dspy_lm param in the config. OpenTrace trainers do not all accept this - # keyword, so keep the injection limited to external trainers that manage - # their own optimization loop. + if not uses_trace_optimizer: + kwargs.setdefault("mode", mode) + + # Keep backward-compatible DSPy stub support, but do not leak DSPy-only + # kwargs into unrelated external trainers. 
if mode == "stub" and not uses_trace_optimizer: - kwargs.setdefault("dspy_lm", "stub") + if getattr(algo, "FRAMEWORK", None) == "dspy": + kwargs.setdefault("dspy_lm", "stub") # Pass through multi-objective config from bundle if present objective_config = bundle.get("objective_config") diff --git a/trace_bench/trainers/README_openevolve_trainer.md b/trace_bench/trainers/README_openevolve_trainer.md new file mode 100644 index 0000000..e719eab --- /dev/null +++ b/trace_bench/trainers/README_openevolve_trainer.md @@ -0,0 +1,8 @@ +# OpenEvolveTrainer + +`OpenEvolveTrainer` is an external Trace-Bench trainer wrapper for `openevolve.run_evolution`. + +- Evolves a **safe literal** candidate mapping of trainable parameter values. +- Never executes candidate code via `exec`. +- Parses candidates using `ast.parse` and `ast.literal_eval` only. +- Can optionally keep only improving updates (`ensure_improvement=True`). diff --git a/trace_bench/trainers/README_textgrad_trainer.md b/trace_bench/trainers/README_textgrad_trainer.md new file mode 100644 index 0000000..13621fa --- /dev/null +++ b/trace_bench/trainers/README_textgrad_trainer.md @@ -0,0 +1,8 @@ +# TextGradTrainer + +`TextGradTrainer` is an external Trace-Bench trainer wrapper for `opto.optimizers.textgrad.TextGrad`. + +- Thin wrapper around NewTrace TextGrad. +- Supports `mode=stub` and `mode=real`. +- Uses trainable Trace parameters only. +- Can optionally keep only improving updates (`ensure_improvement=True`). 
diff --git a/trace_bench/trainers/_external_utils.py b/trace_bench/trainers/_external_utils.py new file mode 100644 index 0000000..fc4ebb8 --- /dev/null +++ b/trace_bench/trainers/_external_utils.py @@ -0,0 +1,132 @@ +from __future__ import annotations + +from copy import deepcopy +import importlib +from typing import Any, Dict, List, Mapping, Sequence, Tuple + + +def collect_trainable_parameters(model: Any) -> List[Any]: + """Return trainable parameter-like objects from a model or standalone parameter.""" + if hasattr(model, "parameters") and callable(model.parameters): + parameters = [parameter for parameter in model.parameters() if getattr(parameter, "trainable", False)] + if parameters: + return list(parameters) + raise ValueError("Model.parameters() returned no trainable parameters.") + if getattr(model, "trainable", False) and hasattr(model, "data"): + return [model] + raise TypeError("Expected a model with parameters() or a standalone trainable parameter-like object.") + + +def coerce_like(example_value: Any, candidate_value: Any) -> Any: + """Coerce a candidate value to the same literal-like type as the current parameter value.""" + if isinstance(example_value, bool): + if not isinstance(candidate_value, bool): + raise TypeError("Expected a boolean candidate value.") + return candidate_value + if isinstance(example_value, int) and not isinstance(example_value, bool): + if isinstance(candidate_value, bool): + raise TypeError("Expected an integer candidate value.") + if isinstance(candidate_value, int): + return candidate_value + if isinstance(candidate_value, float) and candidate_value.is_integer(): + return int(candidate_value) + raise TypeError("Expected an integer candidate value.") + if isinstance(example_value, float): + if isinstance(candidate_value, bool) or not isinstance(candidate_value, (int, float)): + raise TypeError("Expected a numeric candidate value.") + return float(candidate_value) + if isinstance(example_value, str): + if not 
isinstance(candidate_value, str): + raise TypeError("Expected a string candidate value.") + return candidate_value + if isinstance(example_value, list): + if not isinstance(candidate_value, list): + raise TypeError("Expected a list candidate value.") + return candidate_value + if isinstance(example_value, tuple): + if not isinstance(candidate_value, (list, tuple)): + raise TypeError("Expected a sequence candidate value.") + return tuple(candidate_value) + if isinstance(example_value, dict): + if not isinstance(candidate_value, dict): + raise TypeError("Expected a mapping candidate value.") + return candidate_value + raise TypeError(f"Unsupported trainable parameter value type: {type(example_value).__name__}.") + + +def snapshot_parameter_values(parameters: Sequence[Any]) -> Dict[Any, Any]: + """Deep-copy the current values of the provided parameters.""" + return {parameter: deepcopy(getattr(parameter, "data")) for parameter in parameters} + + +def _set_parameter_value(parameter: Any, value: Any) -> None: + """Set a parameter-like object's value in a way that works across Trace variants.""" + try: + setattr(parameter, "data", deepcopy(value)) + return + except Exception: + pass + if hasattr(parameter, "_data"): + setattr(parameter, "_data", deepcopy(value)) + return + raise TypeError("Parameter object does not expose a writable data field.") + + +def restore_parameter_values(snapshot: Mapping[Any, Any]) -> None: + """Restore a parameter snapshot created by snapshot_parameter_values().""" + for parameter, value in snapshot.items(): + _set_parameter_value(parameter, value) + + +def apply_parameter_updates(update_dict: Mapping[Any, Any]) -> None: + """Apply candidate parameter updates in place.""" + for parameter, value in update_dict.items(): + _set_parameter_value(parameter, value) + + +def score_model_on_dataset(agent: Any, guide: Any, dataset: Dict[str, Any], *, suppress_exceptions: bool = False) -> Tuple[float, List[str]]: + """Evaluate an agent on a Trace-Bench 
dataset and return mean score plus feedback strings.""" + inputs = dataset.get("inputs") or [] + infos = dataset.get("infos") or dataset.get("info") or [] + if len(inputs) != len(infos): + raise ValueError("Dataset 'inputs' and 'infos' must have the same length.") + if not inputs: + raise ValueError("Dataset must contain at least one example.") + + scores: List[float] = [] + feedbacks: List[str] = [] + for index, (task_input, task_info) in enumerate(zip(inputs, infos)): + try: + output = agent(task_input) + response = getattr(output, "data", output) + score, feedback = guide(task_input, response, task_info) + scores.append(float(score)) + feedbacks.append(str(feedback)) + except Exception as exc: + if not suppress_exceptions: + raise + scores.append(float("-inf")) + feedbacks.append(f"evaluation_error[{index}]: {type(exc).__name__}") + + return sum(scores) / len(scores), feedbacks + + +def summarize_feedback(feedbacks: Sequence[str], *, max_items: int = 3) -> str: + """Return a compact textual summary of the first few feedback strings.""" + items = [str(item) for item in feedbacks[:max_items]] + return " | ".join(items) + + +def resolve_external_trainer_base() -> type: + """Resolve the most compatible trainer base across OpenTrace variants.""" + try: + module = importlib.import_module("opto.trainer.algorithms.algorithm") + except Exception: + return object + + for class_name in ("Trainer", "AbstractAlgorithm", "Algorithm", "AlgorithmBase"): + trainer_base = getattr(module, class_name, None) + if isinstance(trainer_base, type): + return trainer_base + + return object diff --git a/trace_bench/trainers/openevolve_trainer.py b/trace_bench/trainers/openevolve_trainer.py new file mode 100644 index 0000000..9171e40 --- /dev/null +++ b/trace_bench/trainers/openevolve_trainer.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +import ast +import asyncio +import inspect +import os +from functools import partial +from pathlib import Path +from pprint import pformat 
+from threading import RLock +from typing import Any, Dict, List, Optional, Union + +try: + from openevolve import run_evolution as _run_evolution +except Exception as exc: + raise ImportError("OpenEvolveTrainer requires the optional 'openevolve' package.") from exc + +from trace_bench.llm import openai_compatible_model_name +from trace_bench.trainers._external_utils import apply_parameter_updates, collect_trainable_parameters, coerce_like, resolve_external_trainer_base, restore_parameter_values, score_model_on_dataset, snapshot_parameter_values, summarize_feedback + +_TrainerBase = resolve_external_trainer_base() +_EVALUATION_LOCK = RLock() + +def _validate_literal_value(value: Any) -> None: + """Ensure a parameter value round-trips through repr() and ast.literal_eval().""" + try: + ast.literal_eval(repr(value)) + except Exception as exc: + raise TypeError(f"OpenEvolveTrainer supports only literal-like parameter values; got {type(value).__name__}.") from exc + +def _serialize_candidate_program(parameters: List[Any]) -> str: + """Serialize the current trainable parameter values to a safe Python literal program.""" + payload: Dict[str, Any] = {} + for parameter in parameters: + value = getattr(parameter, "data") + _validate_literal_value(value) + payload[parameter.py_name] = value + return "candidate = " + pformat(payload, sort_dicts=True) + "\n" + +def _parse_candidate_program(program_text: str, parameters: List[Any]) -> Dict[Any, Any]: + """Parse a candidate program and coerce it back into parameter values.""" + try: + syntax_tree = ast.parse(program_text, mode="exec") + except SyntaxError as exc: + raise ValueError("Candidate program must be valid Python.") from exc + if len(syntax_tree.body) != 1 or not isinstance(syntax_tree.body[0], ast.Assign): + raise ValueError("Candidate program must contain exactly one assignment to 'candidate'.") + assignment = syntax_tree.body[0] + if len(assignment.targets) != 1 or not isinstance(assignment.targets[0], ast.Name) or 
assignment.targets[0].id != "candidate": + raise ValueError("Candidate program must assign a literal mapping to 'candidate'.") + try: + candidate_mapping = ast.literal_eval(assignment.value) + except Exception as exc: + raise ValueError("Candidate mapping must be parseable via ast.literal_eval().") from exc + if not isinstance(candidate_mapping, dict): + raise ValueError("Candidate mapping must be a dict.") + expected_names = {parameter.py_name for parameter in parameters} + if set(candidate_mapping.keys()) != expected_names: + raise ValueError("Candidate mapping keys must exactly match the trainable parameter names.") + update_dict: Dict[Any, Any] = {} + for parameter in parameters: + update_dict[parameter] = coerce_like(getattr(parameter, "data"), candidate_mapping[parameter.py_name]) + return update_dict + +def _extract_best_code(result: Any) -> str: + """Extract the best candidate program text from an OpenEvolve result object.""" + if isinstance(result, dict): + for key in ("best_code", "code", "best_program"): + value = result.get(key) + if isinstance(value, str): + return value + for attribute in ("best_code", "code", "best_program"): + value = getattr(result, attribute, None) + if isinstance(value, str): + return value + raise ValueError("run_evolution did not return a best_code-like string.") + +def _filter_supported_kwargs(function: Any, kwargs: Dict[str, Any]) -> Dict[str, Any]: + """Drop kwargs that are not accepted by the target callable.""" + try: + signature = inspect.signature(function) + except (TypeError, ValueError): + return dict(kwargs) + if any(parameter.kind == inspect.Parameter.VAR_KEYWORD for parameter in signature.parameters.values()): + return dict(kwargs) + return {key: value for key, value in kwargs.items() if key in signature.parameters} + +def _run_evolution_compatible(kwargs: Dict[str, Any]) -> Any: + """Run OpenEvolve even when the caller already owns an asyncio event loop.""" + try: + asyncio.get_running_loop() + except 
RuntimeError: + return _run_evolution(**kwargs) + + try: + import nest_asyncio + except ImportError as exc: + raise RuntimeError( + "OpenEvolveTrainer requires nest_asyncio when called from an active asyncio event loop." + ) from exc + nest_asyncio.apply() + return _run_evolution(**kwargs) + +def _build_openevolve_config(*, model: Optional[str], api_base: Optional[str], api_key: Optional[str], api_key_env: str, max_tokens: int, temperature: Optional[float], iterations: int, seed: Optional[int], population_size: Optional[int], num_islands: Optional[int]) -> Any: + """Build an OpenEvolve config for OpenAI-compatible providers when requested.""" + if api_key_env and not isinstance(api_key_env, str): + raise TypeError("api_key_env must be a string.") + if population_size is not None and population_size < 1: + raise ValueError("population_size must be at least 1.") + if num_islands is not None and num_islands < 1: + raise ValueError("num_islands must be at least 1.") + resolved_api_key = api_key or (os.environ.get(api_key_env) if api_key_env else None) + resolved_api_base = api_base or os.environ.get("OPENAI_BASE_URL") or os.environ.get("OPENAI_API_BASE") + resolved_model = model or os.environ.get("TRACE_LITELLM_MODEL") + if not any((resolved_api_key, resolved_api_base, resolved_model, population_size, num_islands)): + return None + if not resolved_model: + resolved_model = "gpt-4o-mini" + if max_tokens < 1: + raise ValueError("max_tokens must be at least 1.") + + from openevolve.config import Config, LLMModelConfig + + config = Config(max_iterations=iterations, random_seed=seed) + if population_size is not None: + config.database.population_size = population_size + if num_islands is not None: + config.database.num_islands = num_islands + if resolved_api_base: + config.llm.api_base = resolved_api_base + if resolved_api_key: + config.llm.api_key = resolved_api_key + config.llm.max_tokens = max_tokens + config.llm.temperature = temperature + model_config = LLMModelConfig( 
class OpenEvolveTrainer(_TrainerBase):
    """Trace-Bench wrapper around OpenEvolve using safe literal parameter serialization."""

    # Class-level flag; set to False for this trainer.
    USES_TRACE_OPTIMIZER = False

    def __init__(self, agent: Any, optimizer: Any = None, logger: Any = None, **_kwargs: Any) -> None:
        """Store the agent and logger; ``optimizer`` and extra kwargs are accepted but unused."""
        del optimizer
        self.param = agent
        self.logger = logger

    def train(
        self,
        guide: Any,
        train_dataset: Dict[str, Any],
        *,
        mode: str = "real",
        validate_dataset: Optional[Dict[str, Any]] = None,
        iterations: int = 10,
        population_size: Optional[int] = None,
        num_islands: Optional[int] = None,
        seed: Optional[int] = None,
        ensure_improvement: bool = True,
        improvement_threshold: float = 0.0,
        verbose: Union[bool, str] = False,
        model: Optional[str] = None,
        api_base: Optional[str] = None,
        api_key: Optional[str] = None,
        api_key_env: str = "OPENAI_API_KEY",
        max_tokens: int = 4096,
        temperature: Optional[float] = 0.7,
        output_dir: Optional[str] = None,
        cleanup: bool = True,
        **_kwargs: Any,
    ) -> Dict[str, Any]:
        """Optimize Trace parameters with OpenEvolve via a literal candidate mapping.

        Candidates are scored on ``train_dataset``. When ``ensure_improvement``
        is set, the best candidate is kept only if its score on
        ``validate_dataset`` (or ``train_dataset`` when no validation set is
        given) reaches ``baseline + improvement_threshold``; otherwise the
        original parameter values are restored.

        Args:
            guide: Scorer handed to ``score_model_on_dataset``.
            train_dataset: Dataset used as the fitness signal during search.
            mode: ``"real"`` runs OpenEvolve; ``"stub"`` returns immediately.
            validate_dataset: Optional dataset used only for the final
                accept/reject gate.
            iterations: Number of evolution iterations (at least 1).
            population_size: Optional OpenEvolve population size.
            num_islands: Optional OpenEvolve island count.
            seed: Optional random seed.
            ensure_improvement: Revert to the baseline if the best candidate
                does not score high enough.
            improvement_threshold: Margin the candidate must gain over the
                baseline score to be kept.
            verbose: Forwarded only when it is a real ``bool``; any other
                value is forwarded as ``False``.
            model, api_base, api_key, api_key_env, max_tokens, temperature:
                Provider settings passed to ``_build_openevolve_config``.
            output_dir, cleanup: Run options passed through
                ``_filter_supported_kwargs`` before reaching ``_run_evolution``.

        Returns:
            A dict with ``"status"`` and ``"resolved_optimizer"`` keys.

        Raises:
            ValueError: If ``mode`` is unknown or ``iterations`` is below 1.
        """
        if mode not in {"real", "stub"}:
            raise ValueError("mode must be either 'real' or 'stub'.")
        if iterations < 1:
            raise ValueError("iterations must be at least 1.")
        if mode == "stub":
            return {"status": "ok", "resolved_optimizer": "openevolve.run_evolution"}

        parameters = collect_trainable_parameters(self.param)
        # Held-out data is reserved for the final gate. The search itself is
        # scored on train_dataset, so a supplied validation set no longer makes
        # train_dataset unused or lets the search fit the held-out examples.
        gate_dataset = validate_dataset or train_dataset
        baseline_snapshot = snapshot_parameter_values(parameters)
        baseline_score: Optional[float] = None
        if ensure_improvement:
            # Only the gate reads the baseline; skip the full pass otherwise.
            baseline_score, _ = score_model_on_dataset(
                agent=self.param, guide=guide, dataset=gate_dataset, suppress_exceptions=True
            )

        def evaluator(candidate_path: str) -> Dict[str, Any]:
            """Score one candidate program file on the training data."""
            program_text = Path(candidate_path).read_text(encoding="utf-8")
            try:
                update_dict = _parse_candidate_program(program_text, parameters)
            except (TypeError, ValueError) as exc:
                # Unparseable candidates get the worst score plus the reason.
                return {"score": float("-inf"), "combined_score": float("-inf"), "feedback": str(exc)}
            # The candidate is applied to the live agent, so apply/score/restore
            # runs under the module lock and always restores the prior values.
            with _EVALUATION_LOCK:
                snapshot = snapshot_parameter_values(parameters)
                try:
                    apply_parameter_updates(update_dict)
                    score, feedbacks = score_model_on_dataset(
                        agent=self.param, guide=guide, dataset=train_dataset, suppress_exceptions=True
                    )
                finally:
                    restore_parameter_values(snapshot)
            return {
                "score": score,
                "combined_score": score,
                "feedback": summarize_feedback(feedbacks),
                "artifacts": {
                    "candidate": {parameter.py_name: value for parameter, value in update_dict.items()}
                },
            }

        initial_program = _serialize_candidate_program(parameters)
        config = _build_openevolve_config(
            model=model,
            api_base=api_base,
            api_key=api_key,
            api_key_env=api_key_env,
            max_tokens=max_tokens,
            temperature=temperature,
            iterations=iterations,
            seed=seed,
            population_size=population_size,
            num_islands=num_islands,
        )
        run_kwargs = {
            "iterations": iterations,
            "population_size": population_size,
            "num_islands": num_islands,
            "seed": seed,
            "verbose": verbose if isinstance(verbose, bool) else False,
            "config": config,
            "output_dir": output_dir,
            "cleanup": cleanup,
        }
        # Unset (None) options are dropped before filtering against _run_evolution.
        filtered_kwargs = _filter_supported_kwargs(
            _run_evolution, {key: value for key, value in run_kwargs.items() if value is not None}
        )
        # NOTE(review): the evaluator is passed wrapped in an argument-less
        # partial; presumably deliberate (a wrapper object rather than the bare
        # closure) - confirm against OpenEvolve before simplifying.
        result = _run_evolution_compatible(
            {"initial_program": initial_program, "evaluator": partial(evaluator), **filtered_kwargs}
        )

        best_code = _extract_best_code(result)
        best_update = _parse_candidate_program(best_code, parameters)
        apply_parameter_updates(best_update)
        if ensure_improvement:
            candidate_score, _ = score_model_on_dataset(
                agent=self.param, guide=guide, dataset=gate_dataset, suppress_exceptions=True
            )
            if candidate_score < baseline_score + improvement_threshold:
                restore_parameter_values(baseline_snapshot)

        return {"status": "ok", "resolved_optimizer": "openevolve.run_evolution"}
class TextGradTrainer(_TrainerBase):
    """Adapter that runs NewTrace's Trace-native TextGrad optimizer inside Trace-Bench."""

    # Class-level flag; set to False for this trainer.
    USES_TRACE_OPTIMIZER = False

    def __init__(self, agent: Any, optimizer: Any = None, logger: Any = None, **_kwargs: Any) -> None:
        """Keep the agent and logger; ``optimizer`` and extra kwargs are accepted but unused."""
        del optimizer
        self.param = agent
        self.logger = logger

    def _normalize_updates(self, update_dict: Dict[Any, Any]) -> Dict[Any, Any]:
        """Return the proposal with every value coerced to match its parameter's current data."""
        return {
            parameter: coerce_like(parameter.data, proposed)
            for parameter, proposed in update_dict.items()
        }

    def _standard_optimization_step(self, guide: Any, task_input: Any, task_info: Any, min_score: float) -> tuple[Any, float, Any]:
        """Run the agent on one example and grade it.

        Returns ``(node, score, feedback)``. If a Trace execution error is
        raised, the error's exception node is returned with ``min_score`` and
        the node's own "full" feedback instead.
        """
        try:
            node = self.param(task_input)
            # Grade the raw payload when the output carries one, else the output itself.
            graded_score, graded_feedback = guide(task_input, getattr(node, "data", node), task_info)
            return node, float(graded_score), graded_feedback
        except trace.ExecutionError as error:
            failed_node = error.exception_node
            return failed_node, float(min_score), failed_node.create_feedback("full")

    def _apply_with_gate(self, guide: Any, parameters: Any, updates: Dict[Any, Any], dataset: Dict[str, Any], ensure_improvement: bool, improvement_threshold: float) -> None:
        """Apply ``updates``; with gating on, roll back when the score does not improve enough."""
        saved_values = snapshot_parameter_values(parameters)
        if not ensure_improvement:
            apply_parameter_updates(updates)
            return

        score_before, _ = score_model_on_dataset(agent=self.param, guide=guide, dataset=dataset, suppress_exceptions=True)
        apply_parameter_updates(updates)
        if score_before is None:
            # No baseline to compare against, so the update is kept as applied.
            return

        score_after, _ = score_model_on_dataset(agent=self.param, guide=guide, dataset=dataset, suppress_exceptions=True)
        if score_after < score_before + improvement_threshold:
            restore_parameter_values(saved_values)

    def train(
        self,
        guide: Any,
        train_dataset: Dict[str, Any],
        *,
        mode: str = "real",
        num_epochs: int = 1,
        batch_size: int = 1,
        min_score: float = 0.0,
        validate_dataset: Optional[Dict[str, Any]] = None,
        ensure_improvement: bool = True,
        improvement_threshold: float = 0.0,
        max_tokens: int = 4096,
        llm: Any = None,
        verbose: Union[bool, str] = False,
        **_kwargs: Any,
    ) -> Dict[str, Any]:
        """Tune the agent's trainable parameters with NewTrace's TextGrad optimizer.

        Each batch of training examples is run and graded, feedback is
        propagated through the optimizer, and the resulting proposal is
        applied. With ``ensure_improvement`` the proposal is reverted unless
        the score on ``validate_dataset`` (or on the batch itself when no
        validation set is given) reaches ``baseline + improvement_threshold``.

        Returns a dict with ``"status"`` and ``"resolved_optimizer"`` keys.
        Raises ``ValueError`` for an unknown ``mode``, for ``num_epochs`` or
        ``batch_size`` below 1, and for an empty or mismatched training set.
        """
        if mode not in {"real", "stub"}:
            raise ValueError("mode must be either 'real' or 'stub'.")
        if num_epochs < 1:
            raise ValueError("num_epochs must be at least 1.")
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1.")

        outcome = {"status": "ok", "resolved_optimizer": "opto.optimizers.textgrad.TextGrad"}
        if mode == "stub":
            return outcome

        parameters = collect_trainable_parameters(self.param)
        inputs = train_dataset.get("inputs") or []
        # Either "infos" or the singular "info" key is accepted.
        infos = train_dataset.get("infos") or train_dataset.get("info") or []
        if len(inputs) != len(infos):
            raise ValueError("train_dataset 'inputs' and 'infos' must have the same length.")
        if not inputs:
            raise ValueError("train_dataset must contain at least one example.")

        optimizer = _TraceTextGrad(parameters=parameters, max_tokens=max_tokens, llm=llm)
        for _epoch in range(num_epochs):
            for offset in range(0, len(inputs), batch_size):
                stop = offset + batch_size
                batch_inputs = inputs[offset:stop]
                batch_infos = infos[offset:stop]
                if validate_dataset:
                    gate_dataset = validate_dataset
                else:
                    gate_dataset = {"inputs": batch_inputs, "infos": batch_infos}

                # Collect feedback for the whole batch before asking for a proposal.
                optimizer.zero_feedback()
                for task_input, task_info in zip(batch_inputs, batch_infos):
                    node, _, feedback = self._standard_optimization_step(
                        guide=guide, task_input=task_input, task_info=task_info, min_score=min_score
                    )
                    optimizer.backward(node, feedback)

                updates = self._normalize_updates(optimizer.step(bypassing=True, verbose=verbose))
                if updates:
                    self._apply_with_gate(
                        guide, parameters, updates, gate_dataset, ensure_improvement, improvement_threshold
                    )

        return outcome