runSuite method
Runs every task in the suite (or only those accepted by filter), honoring
concurrency and each task's trialsPerRun unless trialsOverride is given.
Returns the aggregated report.
Implementation
/// Runs the tasks of [suite] and returns the aggregated [EvalRunReport].
///
/// Every task accepted by [filter] (all tasks when [filter] is null) is
/// planned `trialsOverride ?? task.trialsPerRun` times. The planned trials
/// are handed to [concurrency] `_worker` futures that share one iterator
/// and append their outcomes to a common result list.
///
/// Once all workers have finished, the run-level scores (`task_pass_rate`,
/// `trial_pass_rate` and one `grader_mean.<name>` entry per grader) are
/// passed to `exporter.onRunEnd`. The exporter is then disposed, the
/// recording store (if any) is flushed and the report is saved to the
/// report store (if any).
///
/// Throws a [StateError] when `suite.validate()` reports problems, and a
/// [RangeError] when [concurrency] is less than 1.
Future<EvalRunReport> runSuite({
  required String runName,
  required EvalSuite suite,
  int concurrency = 8,
  int? trialsOverride,
  bool Function(EvalTask)? filter,
}) async {
  final problems = suite.validate();
  if (problems.isNotEmpty) {
    throw StateError(
      'Invalid suite "${suite.name}":\n${problems.join('\n')}',
    );
  }
  // With zero workers nothing would drain the queue and an empty report
  // would be exported and saved as if the run had succeeded.
  if (concurrency < 1) {
    throw RangeError.range(
      concurrency,
      1,
      null,
      'concurrency',
      'Must be at least 1',
    );
  }
  final tasks = filter == null
      ? suite.tasks
      : suite.tasks.where(filter).toList();
  // Build the work queue: (task, trialIndex) pairs.
  final queue = <_PlannedTrial>[];
  for (final task in tasks) {
    final trialsPerRun = trialsOverride ?? task.trialsPerRun;
    for (var i = 0; i < trialsPerRun; i++) {
      queue.add(_PlannedTrial(task: task, trialIndex: i));
    }
  }
  final results = <TrialResult>[];
  final startedAt = DateTime.now();
  final EvalRunReport report;
  try {
    // All workers pull from the same iterator, so each planned trial is
    // taken by exactly one of them.
    final iterator = queue.iterator;
    final pool = List<Future<void>>.generate(
      concurrency,
      (_) => _worker(
        iterator: iterator,
        suite: suite,
        runName: runName,
        results: results,
      ),
    );
    // Future.wait (eagerError defaults to false) reports a failure only
    // after every worker has completed, so no worker is still running when
    // the exporter is disposed below.
    await Future.wait(pool);
    final endedAt = DateTime.now();
    // Run-level aggregate scores: report top-line metrics.
    report = EvalRunReport(
      runName: runName,
      suite: suite,
      trials: results,
      startedAt: startedAt,
      endedAt: endedAt,
    );
    final aggregateScores = <String, double>{
      'task_pass_rate': report.taskPassRate,
      'trial_pass_rate': report.trialPassRate,
      ...report.graderMeans.map((k, v) => MapEntry('grader_mean.$k', v)),
    };
    await exporter.onRunEnd(
      runName: runName,
      suiteName: suite.name,
      aggregateScores: aggregateScores,
    );
  } finally {
    // Dispose the exporter even when a worker or onRunEnd fails.
    await exporter.dispose();
  }
  await recordingStore?.flush();
  await reportStore?.save(report);
  return report;
}