normalizeShortUppercaseDictionaryWords function
Lowercases short all-caps dictionary words inside sentence-like lines.
This targets OCR leftovers like `IS` in prose lines such as
`Your balance IS now 0.00 USD.`, without touching title-case phrases,
mixed-case brands, or longer acronyms such as `USD`.
Implementation
/// Lowercases two-letter all-caps dictionary words in a sentence-like [line].
///
/// The line is returned unchanged when it is empty, when
/// `hasCodeLikeToken` reports a code-like token, or when its ASCII-letter
/// tokens do not look like running prose: fewer than
/// `_sentenceLikeLowercaseTokenMinCount` fully lowercase tokens, or more than
/// `_sentenceLikeTitleCaseTokenMaxCount` title-case / mixed-case tokens.
///
/// Otherwise every standalone run of exactly two uppercase ASCII letters is
/// examined and lowercased only if its lowercase form is in `englishWords`.
/// All other text is left untouched.
String normalizeShortUppercaseDictionaryWords(String line) {
  if (line.isEmpty) {
    return line;
  }
  if (hasCodeLikeToken(line)) {
    return line;
  }

  // Classify each run of ASCII letters. Tokens that are neither fully
  // lowercase nor title/mixed case (e.g. all-caps ones) count toward neither.
  var lowercaseWords = 0;
  var titleLikeWords = 0;
  for (final Match wordMatch in RegExp(r'[A-Za-z]+').allMatches(line)) {
    final String word = wordMatch.group(0)!;
    if (word == word.toLowerCase()) {
      lowercaseWords += 1;
      continue;
    }
    if (isTitleCaseWord(word) || isMixedCase(word)) {
      titleLikeWords += 1;
    }
  }

  final bool sentenceLike =
      lowercaseWords >= _sentenceLikeLowercaseTokenMinCount &&
          titleLikeWords <= _sentenceLikeTitleCaseTokenMaxCount;
  if (!sentenceLike) {
    return line;
  }

  return line.replaceAllMapped(RegExp(r'\b([A-Z]{2})\b'), (Match hit) {
    final String candidate = hit.group(regexGroupFirst)!;
    final String lowered = candidate.toLowerCase();
    // The length guard is kept alongside the regex's fixed two-letter width;
    // the dictionary lookup only runs when the length matches.
    final bool isShortDictionaryWord =
        candidate.length == _shortUppercaseDictionaryTokenLength &&
            englishWords.contains(lowered);
    return isShortDictionaryWord ? lowered : candidate;
  });
}