generateMarkdownReport function

String generateMarkdownReport(
  EvalRunReport report, {
  Map<String, String>? taskBucketMap,
  SuiteHealthReport? health,
  List<int> ksToReport = const [1, 3],
})

生成一次 EvalRunReport 的 Markdown 总结。

Implementation

/// Renders [report] as a Markdown document and returns it as a string.
///
/// Sections are written in this order:
///
/// 1. A title with the run name, followed by suite, start time, duration and
///    task/trial counts.
/// 2. A "Top-line metrics" table: task and trial pass rates plus one row per
///    entry of `report.graderMeans`.
/// 3. A per-task pass@k / pass^k table with one column pair per value in
///    [ksToReport]; omitted when there are no per-task results or
///    [ksToReport] is empty.
/// 4. A pass-rate-by-bucket table; emitted only when [taskBucketMap] is given
///    and `report.bucketPassRates` returns at least one entry.
/// 5. One subsection per trial whose `allGradersPassed` is false, listing
///    every score of that trial.
/// 6. A "Suite health" section; emitted only when [health] is given.
String generateMarkdownReport(
  EvalRunReport report, {
  Map<String, String>? taskBucketMap,
  SuiteHealthReport? health,
  List<int> ksToReport = const [1, 3],
}) {
  final out = StringBuffer();
  final trialsGroupedByTask = report.trialsByTask();

  // Title and run overview.
  out
    ..writeln('# Eval Run: `${report.runName}`')
    ..writeln()
    ..writeln(
      '- Suite: **${report.suite.name}** '
      '(${_kindLabel(report.suite.kind)})',
    )
    ..writeln('- Started: ${report.startedAt.toIso8601String()}')
    ..writeln('- Duration: ${_formatDuration(report.duration)}')
    ..writeln(
      '- Tasks: ${trialsGroupedByTask.length} · '
      'Trials: ${report.trials.length}',
    )
    ..writeln();

  // Top-line metrics table.
  out
    ..writeln('## Top-line metrics')
    ..writeln()
    ..writeln('| Metric | Value |')
    ..writeln('|---|---|')
    ..writeln('| Task pass rate | ${_pct(report.taskPassRate)} |')
    ..writeln('| Trial pass rate | ${_pct(report.trialPassRate)} |');
  for (final mean in report.graderMeans.entries) {
    final formatted = mean.value.toStringAsFixed(3);
    out.writeln('| Grader mean: `${mean.key}` | $formatted |');
  }
  out.writeln();

  // Per-task pass@k / pass^k table. Both lookups are computed up front, even
  // if the table ends up being skipped.
  final passAt = report.passAtKByTask(ks: ksToReport);
  final passCaret = report.passCaretKByTask(ks: ksToReport);
  if (passAt.isNotEmpty && ksToReport.isNotEmpty) {
    final columns = [
      'Task',
      for (final k in ksToReport) ...['pass@$k', 'pass^$k'],
    ];
    final separator = List.filled(columns.length, '---').join('|');
    out
      ..writeln('## pass@k / pass^k by task')
      ..writeln()
      ..writeln('| ${columns.join(' | ')} |')
      ..writeln('|$separator|');
    for (final taskId in passAt.keys) {
      final atK = passAt[taskId];
      final caretK = passCaret[taskId];
      final row = <String>[
        '`$taskId`',
        // A missing value for a given k is reported as 0.
        for (final k in ksToReport) ...[
          _pct(atK?[k] ?? 0),
          _pct(caretK?[k] ?? 0),
        ],
      ];
      out.writeln('| ${row.join(' | ')} |');
    }
    out.writeln();
  }

  // Pass rate per bucket; needs a task -> bucket mapping from the caller.
  final bucketRates =
      taskBucketMap == null ? null : report.bucketPassRates(taskBucketMap);
  if (bucketRates != null && bucketRates.isNotEmpty) {
    out
      ..writeln('## Pass rate by failure bucket')
      ..writeln()
      ..writeln('| Bucket | Pass rate |')
      ..writeln('|---|---|');
    for (final bucket in bucketRates.entries) {
      out.writeln('| ${bucket.key} | ${_pct(bucket.value)} |');
    }
    out.writeln();
  }

  // Details for every trial where at least one grader did not pass.
  final failures = [
    for (final result in report.trials)
      if (!result.allGradersPassed) result,
  ];
  if (failures.isNotEmpty) {
    out
      ..writeln('## Failed trials (${failures.length})')
      ..writeln();
    for (final failure in failures) {
      final trial = failure.trial;
      out
        ..writeln('### `${trial.taskId}` · trial #${trial.trialIndex}')
        ..writeln();
      for (final score in failure.scores) {
        out.writeln(_renderScore(score));
      }
      out.writeln();
    }
  }

  // Suite health summary.
  if (health != null) {
    final saturation = _pct(health.currentSaturationRatio);
    final saturatedNote = health.currentlySaturated ? '⚠️ saturated' : '';
    out
      ..writeln('## Suite health')
      ..writeln()
      // The space after the percentage is always written, even when the
      // note is empty.
      ..writeln('- Saturation: $saturation $saturatedNote');

    final graduating = health.graduationCandidates;
    if (graduating.isNotEmpty) {
      out.writeln('- Graduation candidates:');
      for (final candidate in graduating) {
        out.writeln(
          '  - `${candidate.taskId}` '
          '(mean ${_pct(candidate.recentMeanPassRate)}, '
          '${candidate.consecutiveMatureRuns} consecutive mature runs)',
        );
      }
    }

    final broken = health.brokenTaskCandidates;
    if (broken.isNotEmpty) {
      out.writeln('- Broken task candidates (likely task/grader bugs):');
      for (final candidate in broken) {
        final tally = '${candidate.passedTrials}/${candidate.totalTrials}';
        out.writeln(
          '  - `${candidate.taskId}` ($tally '
          'passed = ${_pct(candidate.passRate)})',
        );
      }
    }
    out.writeln();
  }

  return out.toString();
}