run method
Runs every case in cases and aggregates the metrics.
Cases are processed in list order and matching is deterministic, so the result is stable across runs.
Implementation
/// Runs every case in [cases] through the auditor and aggregates the results
/// into per-rule metrics.
///
/// For each case the snapshot is built, audited against the case's standard,
/// and the resulting findings are matched one-to-one against the case's
/// expectations on rule id, WCAG criterion and identifier:
///
/// * a matched expectation counts as a true positive for its rule;
/// * an unmatched expectation counts as a false negative for its rule;
/// * an in-scope finding that no expectation claimed counts as a false
///   positive for its rule, and also as a false positive on clean when the
///   case's category is [CorpusCategory.clean].
///
/// Isolated cases only score findings produced by the rule under test; all
/// other cases score every finding.
///
/// A warning is recorded when an expectation goes unmatched while a finding
/// for the same rule and criterion, not claimed by any expectation, carries
/// a different identifier. Findings that another expectation legitimately
/// matched never trigger the warning.
///
/// Cases are awaited one at a time, in list order. The returned metrics map
/// and warnings list are unmodifiable.
Future<CorpusMetrics> run(List<CorpusCase> cases) async {
  final byRule = <String, RuleMetrics>{};
  final warnings = <String>[];

  // Lazily creates the metrics bucket for a rule the first time it is seen.
  RuleMetrics metricsFor(String ruleId) =>
      byRule.putIfAbsent(ruleId, () => RuleMetrics(ruleId));

  for (final testCase in cases) {
    final snapshot = await Future<SemanticsSnapshot>.value(testCase.build());
    final findings = await Future<List<Finding>>.value(
      _auditor(snapshot, testCase.standard),
    );

    // Copied to locals so the nullable rule id can be promoted below instead
    // of being force-unwrapped.
    final ruleUnderTest = testCase.ruleUnderTest;
    final isClean = testCase.category == CorpusCategory.clean;

    // Only findings from the rule under test count for an isolated case;
    // every other case scores every rule.
    final inScope = testCase.isIsolated
        ? findings.where((f) => f.ruleId == ruleUnderTest).toList()
        : List<Finding>.of(findings);

    final consumed = <Finding>{};
    final expectations = testCase.expected.toList(growable: false);
    final missedIndexes = <int>[];

    // Pass 1: pair each expectation with the first unclaimed finding that
    // agrees on rule, criterion and identifier.
    for (var i = 0; i < expectations.length; i++) {
      final expected = expectations[i];
      final match = inScope.firstWhereOrNull(
        (f) =>
            !consumed.contains(f) &&
            f.ruleId == expected.ruleId &&
            f.criterion.wcag == expected.wcag &&
            f.identifier == expected.identifier,
      );
      if (match != null) {
        metricsFor(expected.ruleId).truePositives++;
        consumed.add(match);
      } else {
        metricsFor(expected.ruleId).falseNegatives++;
        missedIndexes.add(i);
      }
    }

    // Pass 2: loud authoring signal. A missed expectation only points at a
    // fixture problem (often a missing `identifier:` on the offender) when a
    // same-rule/criterion finding is still unclaimed after *all* matching is
    // done. Checking earlier, or including claimed findings, would blame the
    // fixture for a finding that correctly belongs to another expectation.
    for (final i in missedIndexes) {
      final expected = expectations[i];
      final unanchored = inScope.any(
        (f) =>
            !consumed.contains(f) &&
            f.ruleId == expected.ruleId &&
            f.criterion.wcag == expected.wcag &&
            f.identifier != expected.identifier,
      );
      if (unanchored) {
        warnings.add(
          '${testCase.id}: expected ${expected.ruleId} on '
          '"${expected.identifier}" but the matching finding resolved to a '
          'different identifier — check the fixture\'s identifiers.',
        );
      }
    }

    // Every leftover in-scope finding is a false positive.
    for (final finding in inScope) {
      if (consumed.contains(finding)) continue;
      final metrics = metricsFor(finding.ruleId);
      metrics.falsePositives++;
      if (isClean) {
        metrics.falsePositivesOnClean++;
      }
    }

    // Count clean cases per rule so fpRateOnClean has a denominator.
    if (isClean && ruleUnderTest != null) {
      metricsFor(ruleUnderTest).cleanCases++;
    }
  }

  return CorpusMetrics(
    byRule: Map.unmodifiable(byRule),
    warnings: List.unmodifiable(warnings),
  );
}