calibrate method

Future<CalibrationReport> calibrate({
  required List<HumanLabeledTrial> goldenSet,
  required JudgeScorer judgeScorer,
  int concurrency = 4,
})
Implementation

/// Calibrates a judge against human labels and reports how well they agree.
///
/// Calls [judgeScorer] once for each trial in [goldenSet], keeping at most
/// [concurrency] calls in flight at a time. Trials for which the scorer
/// returns `null` or throws are left out of every statistic.
///
/// The returned [CalibrationReport] holds the Spearman and Pearson
/// correlations between human and judge scores, the agreement rate (pairs
/// whose absolute difference is within `config.agreementTolerance`), the mean
/// absolute error, and the `config.topDisagreements` pairs with the largest
/// absolute difference, sorted from largest to smallest.
///
/// Throws an [ArgumentError] if [goldenSet] is empty, a [RangeError] if
/// [concurrency] is less than 1, and a [StateError] if fewer than two trials
/// end up with a judge score.
Future<CalibrationReport> calibrate({
  required List<HumanLabeledTrial> goldenSet,
  required JudgeScorer judgeScorer,
  int concurrency = 4,
}) async {
  if (goldenSet.isEmpty) {
    throw ArgumentError('goldenSet must not be empty');
  }
  // Without this check a zero concurrency starts no workers at all and the
  // call fails later with a misleading "Got 0" StateError.
  if (concurrency < 1) {
    throw RangeError.range(
      concurrency,
      1,
      null,
      'concurrency',
      'must be at least 1',
    );
  }

  // Fetch judge scores concurrently (at most one judge call per trial).
  // Slot i of `results` belongs to goldenSet[i]; it stays null when the trial
  // yields no score.
  final results = List<_PairedScore?>.filled(goldenSet.length, null);

  // All workers pull from this single iterator. `moveNext` and `current` are
  // read back to back with no `await` in between, so each trial is handed to
  // exactly one worker.
  final iterator = goldenSet.asMap().entries.iterator;

  Future<void> worker() async {
    while (iterator.moveNext()) {
      final entry = iterator.current;
      final i = entry.key;
      final labeled = entry.value;
      try {
        final js = await judgeScorer(labeled);
        if (js != null) {
          results[i] = _PairedScore(
            labeled: labeled,
            judgeValue: js.value,
            judgeRationale: js.rationale,
          );
        }
      } catch (e, st) {
        // Failed samples are ignored (they do not count toward correlation).
        // Callers can add their own fault tolerance inside the scorer or
        // return null. The message below is only printed when assertions are
        // enabled.
        assert(() {
          print('judge scorer failed for ${labeled.trialId}: $e\n$st');
          return true;
        }());
      }
    }
  }

  // More workers than trials would only create futures that finish at once.
  final workerCount = concurrency < goldenSet.length
      ? concurrency
      : goldenSet.length;
  await Future.wait(List.generate(workerCount, (_) => worker()));

  final valid = results.whereType<_PairedScore>().toList();
  if (valid.length < 2) {
    throw StateError(
      'Need at least 2 valid (judge, human) pairs to compute correlation. '
      'Got ${valid.length}.',
    );
  }

  final humanValues = valid.map((p) => p.labeled.humanScore).toList();
  final judgeValues = valid.map((p) => p.judgeValue).toList();

  final spearman = _spearman(humanValues, judgeValues);
  final pearson = _pearson(humanValues, judgeValues);

  // A pair "agrees" when human and judge scores differ by at most the
  // configured tolerance.
  final tolerance = config.agreementTolerance;
  final agreementCount = valid
      .where((p) => (p.labeled.humanScore - p.judgeValue).abs() <= tolerance)
      .length;
  final mae =
      valid
          .map((p) => (p.labeled.humanScore - p.judgeValue).abs())
          .fold<double>(0, (a, b) => a + b) /
      valid.length;

  // Every valid pair is ranked by absolute delta, largest first; only the
  // leading `config.topDisagreements` entries are reported.
  final disagreements =
      valid
          .map(
            (p) => TrialDisagreement(
              trialId: p.labeled.trialId,
              humanScore: p.labeled.humanScore,
              judgeScore: p.judgeValue,
              absoluteDelta: (p.labeled.humanScore - p.judgeValue).abs(),
              humanRationale: p.labeled.humanRationale,
              judgeRationale: p.judgeRationale,
            ),
          )
          .toList()
        ..sort((a, b) => b.absoluteDelta.compareTo(a.absoluteDelta));

  final topDisagreements = disagreements
      .take(config.topDisagreements)
      .toList();

  return CalibrationReport(
    spearmanCorrelation: spearman,
    pearsonCorrelation: pearson,
    agreementRate: agreementCount / valid.length,
    meanAbsoluteError: mae,
    samples: valid.length,
    agreementCount: agreementCount,
    disagreementCount: valid.length - agreementCount,
    disagreements: topDisagreements,
  );
}
calibrate method

Implementation

JudgeCalibratorOps extension on JudgeCalibrator