calibrate method
Future<CalibrationReport> calibrate({
  required List<HumanLabeledTrial> goldenSet,
  required JudgeScorer judgeScorer,
  int concurrency = 4,
})
Implementation
/// Scores every trial in [goldenSet] with [judgeScorer] and reports how the
/// judge's scores compare with the human labels.
///
/// Up to [concurrency] scorer calls are in flight at once, and each trial is
/// passed to [judgeScorer] at most once. A trial whose scorer call throws or
/// returns `null` is left out of the statistics.
///
/// The returned [CalibrationReport] carries the values produced by
/// `_spearman` and `_pearson`, the agreement rate (pairs whose absolute
/// difference is within `config.agreementTolerance`), the mean absolute
/// error, and the `config.topDisagreements` pairs with the largest absolute
/// difference, largest first.
///
/// Throws an [ArgumentError] if [goldenSet] is empty or [concurrency] is less
/// than 1. Throws a [StateError] if fewer than two trials produced a judge
/// score.
Future<CalibrationReport> calibrate({
  required List<HumanLabeledTrial> goldenSet,
  required JudgeScorer judgeScorer,
  int concurrency = 4,
}) async {
  if (goldenSet.isEmpty) {
    throw ArgumentError('goldenSet must not be empty');
  }
  // Without this check a zero concurrency would start no workers and surface
  // later as a misleading "not enough valid pairs" StateError.
  if (concurrency < 1) {
    throw ArgumentError.value(concurrency, 'concurrency', 'must be at least 1');
  }

  // Fetch judge scores concurrently (at most one judge call per trial).
  // Results are stored by original index; slots for skipped trials stay null.
  final results = List<_PairedScore?>.filled(goldenSet.length, null);

  // All workers pull from this single iterator, so each trial is claimed by
  // exactly one worker.
  final iterator = goldenSet.asMap().entries.iterator;

  Future<void> worker() async {
    while (iterator.moveNext()) {
      final entry = iterator.current;
      final i = entry.key;
      final labeled = entry.value;
      try {
        final js = await judgeScorer(labeled);
        if (js != null) {
          results[i] = _PairedScore(
            labeled: labeled,
            judgeValue: js.value,
            judgeRationale: js.rationale,
          );
        }
      } catch (e, st) {
        // Failed samples are ignored (not counted toward the correlation).
        // Callers can add fault tolerance inside the scorer or return null.
        // The failure is only printed when assertions are enabled.
        assert(() {
          print('judge scorer failed for ${labeled.trialId}: $e\n$st');
          return true;
        }());
      }
    }
  }

  // More workers than trials would exit immediately; don't spawn them.
  final workerCount =
      concurrency < goldenSet.length ? concurrency : goldenSet.length;
  await Future.wait(List.generate(workerCount, (_) => worker()));

  final valid = results.whereType<_PairedScore>().toList();
  if (valid.length < 2) {
    throw StateError(
      'Need at least 2 valid (judge, human) pairs to compute correlation. '
      'Got ${valid.length}.',
    );
  }

  final humanValues = valid.map((p) => p.labeled.humanScore).toList();
  final judgeValues = valid.map((p) => p.judgeValue).toList();
  final spearman = _spearman(humanValues, judgeValues);
  final pearson = _pearson(humanValues, judgeValues);

  // A pair "agrees" when its scores differ by no more than the tolerance.
  final tolerance = config.agreementTolerance;
  final agreementCount = valid
      .where((p) => (p.labeled.humanScore - p.judgeValue).abs() <= tolerance)
      .length;

  final mae =
      valid
          .map((p) => (p.labeled.humanScore - p.judgeValue).abs())
          .fold<double>(0, (a, b) => a + b) /
      valid.length;

  // Every valid pair, ordered by descending absolute difference.
  final disagreements =
      valid
          .map(
            (p) => TrialDisagreement(
              trialId: p.labeled.trialId,
              humanScore: p.labeled.humanScore,
              judgeScore: p.judgeValue,
              absoluteDelta: (p.labeled.humanScore - p.judgeValue).abs(),
              humanRationale: p.labeled.humanRationale,
              judgeRationale: p.judgeRationale,
            ),
          )
          .toList()
        ..sort((a, b) => b.absoluteDelta.compareTo(a.absoluteDelta));
  final topDisagreements = disagreements
      .take(config.topDisagreements)
      .toList();

  return CalibrationReport(
    spearmanCorrelation: spearman,
    pearsonCorrelation: pearson,
    agreementRate: agreementCount / valid.length,
    meanAbsoluteError: mae,
    samples: valid.length,
    agreementCount: agreementCount,
    disagreementCount: valid.length - agreementCount,
    disagreements: topDisagreements,
  );
}