diffRunReports function
EvalRunDiff
diffRunReports({
- required EvalRunReport current,
- required EvalRunReport baseline,
- double significanceThreshold = 0.05,
})
计算两份 run report 的 diff。
Implementation
/// Computes the diff between two eval run reports.
///
/// Every task id present in both [current] and [baseline] gets a
/// [TaskTransition] comparing its pass rate in the two reports. A task is
/// [TaskTransitionKind.unchanged] when the two pass rates are equal or differ
/// by less than [significanceThreshold] in absolute value,
/// [TaskTransitionKind.improved] when the pass rate in [current] is higher,
/// and [TaskTransitionKind.regressed] otherwise.
///
/// Task ids found only in [current] are reported as `addedTaskIds`, and ids
/// found only in [baseline] as `removedTaskIds`; both lists are sorted.
///
/// `metricDeltas` holds `current - baseline` for `task_pass_rate` and
/// `trial_pass_rate`, plus one `grader_mean.<name>` entry for each grader
/// mean that is present in both reports.
EvalRunDiff diffRunReports({
  required EvalRunReport current,
  required EvalRunReport baseline,
  double significanceThreshold = 0.05,
}) {
  final currByTask = current.trialsByTask();
  final baseByTask = baseline.trialsByTask();

  // Materialize each key set once; the three set operations below reuse them.
  final currIds = currByTask.keys.toSet();
  final baseIds = baseByTask.keys.toSet();

  final transitions = <TaskTransition>[];
  for (final id in currIds.intersection(baseIds)) {
    // `id` comes from the intersection, so it is a key of both maps.
    final cur = _passRate(currByTask[id]!);
    final base = _passRate(baseByTask[id]!);
    final delta = cur - base;

    final TaskTransitionKind kind;
    if (delta == 0 || delta.abs() < significanceThreshold) {
      // The explicit zero check keeps identical pass rates `unchanged` even
      // when the caller passes a threshold of 0 (or a negative one); without
      // it such tasks would fall through to `regressed`.
      kind = TaskTransitionKind.unchanged;
    } else if (delta > 0) {
      kind = TaskTransitionKind.improved;
    } else {
      kind = TaskTransitionKind.regressed;
    }

    transitions.add(
      TaskTransition(
        taskId: id,
        currentPassRate: cur,
        baselinePassRate: base,
        kind: kind,
      ),
    );
  }

  final added = currIds.difference(baseIds);
  final removed = baseIds.difference(currIds);

  final deltas = <String, double>{
    'task_pass_rate': current.taskPassRate - baseline.taskPassRate,
    'trial_pass_rate': current.trialPassRate - baseline.trialPassRate,
  };
  // Only graders that exist in both reports yield a delta.
  for (final entry in current.graderMeans.entries) {
    final baseValue = baseline.graderMeans[entry.key];
    if (baseValue != null) {
      deltas['grader_mean.${entry.key}'] = entry.value - baseValue;
    }
  }

  return EvalRunDiff(
    current: current,
    baseline: baseline,
    transitions: transitions,
    addedTaskIds: added.toList()..sort(),
    removedTaskIds: removed.toList()..sort(),
    metricDeltas: deltas,
  );
}