Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,26 @@ pip install -e .

---

## 🧪 Evaluation & Baseline Report

`trainer_config.json` 支持开启训练后自动生成的对比报告:
```json
{
"report_config": {
"enable": true,
"sample_size": 8,
"success_threshold": 0.5,
"report_name": "report.json"
}
}
```
启用后,训练会自动:
- 在日志目录下运行基线评估(不更新提示)并保存到 `logs/<timestamp>/baseline/`
- 使用最新方案运行对比评估到 `logs/<timestamp>/final/`
- 生成汇总报告 `logs/<timestamp>/report.json`(文件名可通过 `report_name` 配置),包含基线与最终评估的平均得分/成功率及平均得分差值、每步的得分/损失轨迹和主要反思片段

---

## ⭐ Star History
[![Star History Chart](https://api.star-history.com/svg?repos=aiwaves-cn/agents&type=Date)](https://star-history.com/#aiwaves-cn/agents&Date)

Expand Down
1 change: 1 addition & 0 deletions src/agents/evaluation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@
# limitations under the License.

from .case import *
from .report import EvaluationReporter, ReportConfig
136 changes: 136 additions & 0 deletions src/agents/evaluation/report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import json
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional


class ReportConfig:
    """
    Settings that control the post-training evaluation report.

    Every option is read from an optional plain dict; a missing key falls
    back to the default shown next to the attribute below.
    """

    def __init__(self, config_dict: Optional[dict] = None):
        # A missing or empty dict means "use every default".
        settings = config_dict if config_dict else {}
        option = settings.get

        # Master switch and output file name.
        self.enable: bool = option("enable", False)
        self.report_name: str = option("report_name", "report.json")

        # Which cases to evaluate: explicit indices, else the first `sample_size`.
        self.eval_indices: Optional[List[int]] = option("eval_indices", None)
        self.sample_size: int = option("sample_size", 8)

        # How results are summarised.
        self.success_threshold: Optional[float] = option("success_threshold", None)
        self.top_k_reflections: int = option("top_k_reflections", 5)

        # Optional fixed baseline solution; callers fall back to the initial solution.
        self.baseline_solution_path: Optional[str] = option("baseline_solution_path", None)


class EvaluationReporter:
    """
    Utility class to aggregate evaluation results and emit a compact report.

    Case results are read from ``*.json`` files in a directory. From each case
    dict the reporter reads ``dataset_eval.score``, ``loss.score``,
    ``loss.requirement_for_previous`` and ``case_id``; any of them may be
    missing or ``null``.
    """

    def __init__(self, report_config: ReportConfig):
        self.report_config = report_config

    @staticmethod
    def _section(case: Dict[str, Any], key: str) -> Dict[str, Any]:
        """Return ``case[key]`` when it is a dict, otherwise an empty dict.

        A plain ``case.get(key, {})`` is not enough: a key that is present
        with a ``null`` value would yield ``None`` and break the chained
        ``.get`` that follows.
        """
        section = case.get(key)
        return section if isinstance(section, dict) else {}

    def _load_cases_from_dir(self, case_dir: Path) -> List[Dict[str, Any]]:
        """Load every JSON object stored as a ``*.json`` file in ``case_dir``.

        Returns an empty list when ``case_dir`` is not an existing directory.
        Files that cannot be decoded, and payloads that are not JSON objects,
        are skipped. Files are visited in sorted path order so the result does
        not depend on filesystem enumeration order.
        """
        if not case_dir.is_dir():
            return []
        case_dicts: List[Dict[str, Any]] = []
        for file in sorted(case_dir.iterdir()):
            # Skip other suffixes and directories that happen to end in ".json".
            if file.suffix != ".json" or not file.is_file():
                continue
            try:
                with open(file, "r", encoding="utf-8") as f:
                    payload = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                continue
            # Later steps call .get() on every case, so only objects qualify.
            if isinstance(payload, dict):
                case_dicts.append(payload)
        return case_dicts

    def _summarize_cases(self, case_dicts: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Aggregate scores over ``case_dicts``.

        Returns a dict with ``num_cases``, ``average_score``, ``average_loss``
        and ``success_rate``. An average is ``None`` when no case carries a
        numeric value for it. ``success_rate`` is the share of scores at or
        above ``success_threshold`` when a threshold is configured; without a
        threshold it is the mean score, but only if every score is 0 or 1;
        otherwise it is ``None``.
        """
        scores: List[float] = []
        loss_scores: List[float] = []
        for case in case_dicts:
            score = self._section(case, "dataset_eval").get("score")
            if isinstance(score, (int, float)):
                scores.append(float(score))

            loss_score = self._section(case, "loss").get("score")
            if isinstance(loss_score, (int, float)):
                loss_scores.append(float(loss_score))

        avg_score = sum(scores) / len(scores) if scores else None
        avg_loss = sum(loss_scores) / len(loss_scores) if loss_scores else None

        success_rate = None
        if scores:
            threshold = self.report_config.success_threshold
            if threshold is not None:
                success_rate = sum(1 for s in scores if s >= threshold) / len(scores)
            elif all(s in (0, 1) for s in scores):
                success_rate = sum(scores) / len(scores)

        return {
            "num_cases": len(case_dicts),
            "average_score": avg_score,
            "average_loss": avg_loss,
            "success_rate": success_rate,
        }

    def extract_reflections(self, case_dicts: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Get top reflection snippets from loss / sop suggestions.

        Only cases with a non-empty ``loss.requirement_for_previous`` are kept.
        They are sorted ascending by dataset score, then by loss score, with a
        non-numeric score ranked as -1, and cut to the first
        ``top_k_reflections`` entries.
        """
        reflections: List[Dict[str, Any]] = []
        for case in case_dicts:
            loss_info = self._section(case, "loss")
            requirement = loss_info.get("requirement_for_previous")
            if requirement:
                reflections.append(
                    {
                        "case_id": case.get("case_id"),
                        "dataset_score": self._section(case, "dataset_eval").get("score"),
                        "loss_score": loss_info.get("score"),
                        "requirement_for_previous": requirement,
                    }
                )

        def _rank(value: Any) -> float:
            # Non-numeric scores sort as -1.
            return value if isinstance(value, (int, float)) else -1

        reflections.sort(
            key=lambda item: (_rank(item.get("dataset_score")), _rank(item.get("loss_score")))
        )
        return reflections[: self.report_config.top_k_reflections]

    def build_report(
        self,
        baseline_summary: Optional[Dict[str, Any]],
        final_summary: Dict[str, Any],
        training_curve: List[Dict[str, Any]],
        reflections: List[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Assemble the report dict from the already computed pieces.

        ``delta_average_score`` is final minus baseline average score, or
        ``None`` when either average is unavailable. ``generated_at`` is the
        current UTC time in ISO-8601 form with a trailing ``Z``.
        """
        # Function-scope import: the module only imports `datetime` itself.
        from datetime import timezone

        delta_score = None
        if (
            baseline_summary
            and baseline_summary.get("average_score") is not None
            and final_summary.get("average_score") is not None
        ):
            delta_score = final_summary["average_score"] - baseline_summary["average_score"]

        # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC
        # time and keep the previous "...Z" text format.
        generated_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

        return {
            "generated_at": generated_at,
            "baseline": baseline_summary,
            "final": final_summary,
            "delta_average_score": delta_score,
            "training_curve": training_curve,
            "top_reflections": reflections,
        }

    def summarize_dir(self, case_dir: Path) -> Dict[str, Any]:
        """Summarise the cases in ``case_dir`` and record the directory as ``case_dir``."""
        case_dicts = self._load_cases_from_dir(case_dir)
        summary = self._summarize_cases(case_dicts)
        summary["case_dir"] = str(case_dir)
        return summary

    def collect_reflections_from_dir(self, case_dir: Path) -> List[Dict[str, Any]]:
        """Load the cases in ``case_dir`` and return their top reflections."""
        case_dicts = self._load_cases_from_dir(case_dir)
        return self.extract_reflections(case_dicts)

    @staticmethod
    def save_report(report: Dict[str, Any], save_path: Path):
        """Write ``report`` as indented UTF-8 JSON, creating parent directories as needed."""
        save_path.parent.mkdir(parents=True, exist_ok=True)
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(report, f, ensure_ascii=False, indent=4)

84 changes: 81 additions & 3 deletions src/agents/optimization/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from datetime import datetime
from pathlib import Path
import random
from typing import Optional

import wandb

Expand All @@ -18,6 +19,7 @@
ToolkitOptimizer,
)
from agents.evaluation.case import Case
from agents.evaluation.report import EvaluationReporter, ReportConfig
from agents.optimization.utils import OptimUtils
from agents.datasets import BaseDataset
from .. import Solution, SolutionConfig
Expand All @@ -37,6 +39,7 @@ def __init__(self, config_path_or_dict):
self.optim_order = self.config_dict["optim_order"]
self.optimizers = self.config_dict["optimizers"]
self.additional_info = self.config_dict.get("additional_info", {})
self.report_config = self.config_dict.get("report_config", {})

# initial solution and optimizer config
self.initial_solution_path = self.config_dict["initial_solution_path"]
Expand Down Expand Up @@ -103,6 +106,7 @@ def __init__(self, config: TrainerConfig, dataset: BaseDataset):
self.sample_kind: str = config.sample_kind
self.allow_duplicate_samples: bool = config.allow_duplicate_samples
self.sampled_idx_set = set()
self.initial_solution_path = config.initial_solution_path

# task setting
self.has_ground_truth = config.has_ground_truth
Expand Down Expand Up @@ -169,8 +173,21 @@ def __init__(self, config: TrainerConfig, dataset: BaseDataset):
)

# others
self.initial_solution_path = config.initial_solution_path
self.exceed_threshold_times = 0
self.report_config = ReportConfig(config.report_config)
self.reporter = EvaluationReporter(self.report_config)
self.report_eval_indices = self._prepare_report_indices()
self.training_curve: list[dict] = []
self.baseline_summary = None
self.last_step_path: Optional[Path] = None

if self.report_config.enable:
baseline_solution_path = self.report_config.baseline_solution_path or self.initial_solution_path
baseline_solution = Solution(config=SolutionConfig(baseline_solution_path))
baseline_dir = self.log_path / self.time_path / "baseline"
baseline_dir.mkdir(parents=True, exist_ok=True)
self.baseline_summary = self._run_eval_for_report(baseline_solution, baseline_dir)
self.logger.info(f"Baseline evaluation saved at {baseline_dir}")

def get_step_optim_order(self, last_optim_order: list[str]) -> list[str]:
"""
Expand Down Expand Up @@ -310,25 +327,44 @@ def train(self):
case_list, solution, save_step_path
)

step_loss = self._compute_step_loss(case_list)

# record some information
self.logger.info(f"step {step} optim status: {op_status}")
case_count += self.batch_size
step_cost_time.append(time.time() - step_start_time)
self.logger.info(f"step {step} cost time: {step_cost_time[-1] / 60:.1f}min")
self.training_curve.append(
{"step": step, "avg_score": ave_score_list[-1], "avg_loss": step_loss}
)
self.last_step_path = save_step_path

# log the result of this step to wandb
wandb.log({
log_payload = {
"step": step,
"score_before_optim": ave_score_list[-1],
"scores": scores,
"cost_time": step_cost_time[-1],
"case_count": case_count,
"optim_status": op_status,
})
}
if step_loss is not None:
log_payload["avg_loss"] = step_loss
wandb.log(log_payload)

self.logger.info(
f"Training completed. Total time spent: {sum(step_cost_time) / 60:.1f} minutes. There were {case_count} cases and {step} steps.")

if self.report_config.enable:
final_dir = self.log_path / self.time_path / "final"
final_dir.mkdir(parents=True, exist_ok=True)
final_summary = self._run_eval_for_report(solution, final_dir)
reflections = self._collect_reflections_for_report(self.last_step_path)
report = self.reporter.build_report(self.baseline_summary, final_summary, self.training_curve, reflections)
report_path = self.log_path / self.time_path / self.report_config.report_name
self.reporter.save_report(report, report_path)
self.logger.info(f"Evaluation report saved to {report_path}")

wandb.finish()

def sample_case_list(self, sample_kind: str, from_idx: int):
Expand Down Expand Up @@ -468,6 +504,48 @@ def roll_back(
self.logger.debug("no need to roll back")
return solution, finished_case_list

def _compute_step_loss(self, case_list: list[Case]) -> Optional[float]:
loss_scores = [
case.loss.score
for case in case_list
if hasattr(case, "loss") and isinstance(getattr(case.loss, "score", None), (int, float))
]
if not loss_scores:
return None
return sum(loss_scores) / len(loss_scores)

def _prepare_report_indices(self) -> list[int]:
if not self.report_config.enable:
return []
if self.report_config.eval_indices:
return [
idx for idx in self.report_config.eval_indices
if 0 <= idx < len(self.dataset)
]
sample_size = min(self.report_config.sample_size, len(self.dataset))
return list(range(sample_size))

def _run_eval_for_report(self, solution: Solution, save_dir: Path):
    """Run ``solution`` on the report cases and summarise what lands in ``save_dir``.

    Returns ``None`` when reporting is disabled; otherwise returns the
    reporter's summary of ``save_dir`` after the cases have been forwarded.
    """
    cfg = self.report_config
    if not cfg.enable:
        return None

    # Use the prepared indices; if that list is empty, take a leading slice.
    eval_indices = self.report_eval_indices
    if not eval_indices:
        eval_indices = list(range(min(cfg.sample_size, len(self.dataset))))

    eval_cases = []
    for idx in eval_indices:
        eval_cases.append(Case(self.dataset.get_case_dict(idx)))

    OptimUtils.parallel_case_forward(
        eval_cases,
        solution,
        self.parallel_max_num,
        save_dir,
        self.dataset.evaluate,
        self.logger,
    )
    return self.reporter.summarize_dir(save_dir)

def _collect_reflections_for_report(self, step_path: Optional[Path]):
if not self.report_config.enable or not step_path:
return []
case_dir = step_path / "case_after_loss"
return self.reporter.collect_reflections_from_dir(case_dir)


def setup_logging(save_log_path: Path):
"""
Expand Down