normalizeShortUppercaseDictionaryWords function

String normalizeShortUppercaseDictionaryWords(String line)

Lowercases short all-caps dictionary words inside sentence-like lines.

This targets OCR leftovers like `IS` in prose lines such as `Your balance IS now 0.00 USD.` without touching title-case phrases, mixed-case brands, or longer acronyms such as `USD`.

Implementation

/// Lowercases short all-caps dictionary words inside sentence-like lines.
///
/// [line] is returned unchanged when it is empty, when [hasCodeLikeToken]
/// reports a code-like token, or when its ASCII-letter tokens do not look
/// like prose: fewer than [_sentenceLikeLowercaseTokenMinCount] all-lowercase
/// tokens, or more than [_sentenceLikeTitleCaseTokenMaxCount] tokens that
/// [isTitleCaseWord] or [isMixedCase] accept.
///
/// Otherwise every run of exactly [_shortUppercaseDictionaryTokenLength]
/// ASCII capitals that sits between regex word boundaries, and whose
/// lowercase form is in [englishWords], is replaced by that lowercase form.
/// All other text is left untouched.
String normalizeShortUppercaseDictionaryWords(String line) {
  if (line.isEmpty || hasCodeLikeToken(line)) {
    return line;
  }

  // Classify each ASCII-letter run. Tokens that are neither all-lowercase nor
  // title/mixed case (for example all-caps words) count toward neither total.
  int lowercaseTokenCount = 0;
  int titleCaseLikeTokenCount = 0;
  for (final Match match in RegExp(r'[A-Za-z]+').allMatches(line)) {
    final String token = match.group(0)!;
    if (token == token.toLowerCase()) {
      lowercaseTokenCount++;
    } else if (isTitleCaseWord(token) || isMixedCase(token)) {
      titleCaseLikeTokenCount++;
    }
  }

  final bool looksLikeSentence =
      lowercaseTokenCount >= _sentenceLikeLowercaseTokenMinCount &&
          titleCaseLikeTokenCount <= _sentenceLikeTitleCaseTokenMaxCount;
  if (!looksLikeSentence) {
    return line;
  }

  // The quantifier is taken from the length constant so the pattern and the
  // configured token length cannot drift apart. A literal `{2}` combined with
  // a separate length check would match nothing if the constant ever changed.
  final RegExp shortUppercaseToken =
      RegExp('\\b([A-Z]{${_shortUppercaseDictionaryTokenLength}})\\b');

  return line.replaceAllMapped(shortUppercaseToken, (Match match) {
    final String token = match.group(regexGroupFirst)!;
    final String lowered = token.toLowerCase();
    // Only real dictionary words are rewritten; other capital runs of the
    // same length are kept verbatim.
    return englishWords.contains(lowered) ? lowered : token;
  });
}