runSuite method

Future<EvalRunReport> runSuite({
  required String runName,
  required EvalSuite suite,
  int concurrency = 8,
  int? trialsOverride,
  bool Function(EvalTask)? filter,
})

Runs all tasks in suite (or only those accepted by filter, when one is given), honoring concurrency and each task's trialsPerRun unless trialsOverride is provided. Returns the aggregated report.

Implementation

/// Runs the tasks of [suite] and returns the aggregated [EvalRunReport].
///
/// The suite is validated first. When [filter] is supplied, only the tasks it
/// accepts are run. Each selected task is scheduled [trialsOverride] times
/// when that is non-null, otherwise `task.trialsPerRun` times. The planned
/// trials are handed to [concurrency] `_worker` futures that all share one
/// iterator over the queue.
///
/// Once every worker has finished, the run-level aggregate scores are passed
/// to `exporter.onRunEnd`, the exporter is disposed, `recordingStore` (if
/// set) is flushed, and the report is saved to `reportStore` (if set). The
/// exporter is disposed even when a worker or `onRunEnd` fails.
///
/// Throws a [StateError] if `suite.validate()` reports any problems, and a
/// [RangeError] if [concurrency] is less than 1.
Future<EvalRunReport> runSuite({
  required String runName,
  required EvalSuite suite,
  int concurrency = 8,
  int? trialsOverride,
  bool Function(EvalTask)? filter,
}) async {
  final problems = suite.validate();
  if (problems.isNotEmpty) {
    throw StateError(
      'Invalid suite "${suite.name}":\n${problems.join('\n')}',
    );
  }

  // With zero workers the pool below would be empty, `Future.wait` would
  // complete immediately, and an empty report would be returned without a
  // single trial having run. Fail loudly instead.
  if (concurrency < 1) {
    throw RangeError.range(concurrency, 1, null, 'concurrency');
  }

  final tasks = filter == null
      ? suite.tasks
      : suite.tasks.where(filter).toList();

  // Build the work queue: (task, trialIndex) pairs.
  final queue = <_PlannedTrial>[
    for (final task in tasks)
      for (var i = 0; i < (trialsOverride ?? task.trialsPerRun); i++)
        _PlannedTrial(task: task, trialIndex: i),
  ];

  final results = <TrialResult>[];
  final startedAt = DateTime.now();

  final EvalRunReport report;
  try {
    // Bounded concurrency: a fixed number of workers is started and each one
    // is given the same iterator over the queue.
    final iterator = queue.iterator;
    final pool = List<Future<void>>.generate(
      concurrency,
      (_) => _worker(
        iterator: iterator,
        suite: suite,
        runName: runName,
        results: results,
      ),
    );
    await Future.wait(pool);

    final endedAt = DateTime.now();

    // Run-level aggregate scores: report top-line metrics.
    report = EvalRunReport(
      runName: runName,
      suite: suite,
      trials: results,
      startedAt: startedAt,
      endedAt: endedAt,
    );

    final aggregateScores = <String, double>{
      'task_pass_rate': report.taskPassRate,
      'trial_pass_rate': report.trialPassRate,
      ...report.graderMeans.map((k, v) => MapEntry('grader_mean.$k', v)),
    };

    await exporter.onRunEnd(
      runName: runName,
      suiteName: suite.name,
      aggregateScores: aggregateScores,
    );
  } finally {
    // Release the exporter on failure as well as on success, so a failing
    // trial or export does not leave it undisposed.
    await exporter.dispose();
  }

  // Only reached on success, preserving the original ordering:
  // onRunEnd -> dispose -> flush -> save.
  await recordingStore?.flush();
  await reportStore?.save(report);

  return report;
}