run method

Runs every case in `cases` and aggregates the metrics.

Cases are processed in list order and matching is deterministic, so the result is stable across runs.

Implementation

/// Runs every case in [cases] through the auditor and aggregates the
/// resulting per-rule metrics.
///
/// For each case the snapshot returned by the case's `build()` is audited
/// against the case's standard, and the findings are matched one-to-one
/// against the case's expectations on rule id, WCAG criterion and
/// identifier:
///
/// * a matched expectation counts as a true positive for its rule;
/// * an unmatched expectation counts as a false negative for its rule;
/// * every in-scope finding that no expectation claimed counts as a false
///   positive for its rule, and additionally as a false positive on clean
///   when the case's category is [CorpusCategory.clean].
///
/// Isolated cases only score findings whose rule is the rule under test;
/// all other cases score every finding.
///
/// A warning is recorded when an expectation is missed while an unclaimed
/// finding with the same rule and criterion, but a different identifier,
/// exists for that case.
///
/// Cases are processed sequentially in list order. Returns a
/// [CorpusMetrics] holding unmodifiable copies of the per-rule metrics and
/// of the warnings.
Future<CorpusMetrics> run(List<CorpusCase> cases) async {
  final byRule = <String, RuleMetrics>{};
  final warnings = <String>[];

  // Lazily creates the metrics bucket for a rule on first use.
  RuleMetrics metricsFor(String ruleId) =>
      byRule.putIfAbsent(ruleId, () => RuleMetrics(ruleId));

  for (final testCase in cases) {
    // Future.value accepts either a plain value or a future, so both
    // synchronous and asynchronous builders/auditors are awaited uniformly.
    final snapshot = await Future<SemanticsSnapshot>.value(testCase.build());
    final findings = await Future<List<Finding>>.value(
      _auditor(snapshot, testCase.standard),
    );

    // Only findings from the rule under test count for an isolated case;
    // every other case scores every rule.
    final inScope = testCase.isIsolated
        ? findings.where((f) => f.ruleId == testCase.ruleUnderTest).toList()
        : List<Finding>.of(findings);

    // Identity-based on purpose: two distinct findings that happen to
    // compare equal must still be claimed, and scored, separately.
    final consumed = Set<Finding>.identity();

    // Positions (in iteration order of `testCase.expected`) of the
    // expectations that found no match; used for the warning pass below.
    final missedPositions = <int>{};

    var position = 0;
    for (final expected in testCase.expected) {
      final match = inScope.firstWhereOrNull(
        (f) =>
            !consumed.contains(f) &&
            f.ruleId == expected.ruleId &&
            f.criterion.wcag == expected.wcag &&
            f.identifier == expected.identifier,
      );
      if (match != null) {
        metricsFor(expected.ruleId).truePositives++;
        consumed.add(match);
      } else {
        metricsFor(expected.ruleId).falseNegatives++;
        missedPositions.add(position);
      }
      position++;
    }

    // Loud authoring signal: a same-rule/criterion finding exists but is
    // not anchored to the expected identifier (often a missing
    // `identifier:` on the fixture's offender).
    //
    // This runs only after matching has finished and ignores claimed
    // findings, so a finding that legitimately satisfied another
    // expectation never raises a false alarm, regardless of the order in
    // which the expectations are listed.
    position = 0;
    for (final expected in testCase.expected) {
      if (!missedPositions.contains(position++)) continue;
      final unanchored = inScope.any(
        (f) =>
            !consumed.contains(f) &&
            f.ruleId == expected.ruleId &&
            f.criterion.wcag == expected.wcag &&
            f.identifier != expected.identifier,
      );
      if (unanchored) {
        warnings.add(
          '${testCase.id}: expected ${expected.ruleId} on '
          '"${expected.identifier}" but the matching finding resolved to a '
          'different identifier — check the fixture\'s identifiers.',
        );
      }
    }

    final isClean = testCase.category == CorpusCategory.clean;

    // Every leftover in-scope finding is a false positive.
    for (final finding in inScope) {
      if (consumed.contains(finding)) continue;
      final metrics = metricsFor(finding.ruleId);
      metrics.falsePositives++;
      if (isClean) {
        metrics.falsePositivesOnClean++;
      }
    }

    // Count clean cases per rule so fpRateOnClean has a denominator.
    // Copying to a local lets the null check promote without `!`.
    final ruleUnderTest = testCase.ruleUnderTest;
    if (isClean && ruleUnderTest != null) {
      metricsFor(ruleUnderTest).cleanCases++;
    }
  }

  return CorpusMetrics(
    byRule: Map.unmodifiable(byRule),
    warnings: List.unmodifiable(warnings),
  );
}