normalizeLetterConfusions function - post_process_text library

textify package
documentation
post_process_text.dart
normalizeLetterConfusions function

normalizeLetterConfusions function

String normalizeLetterConfusions(String text)

Normalizes common multi-character letter confusions within non-dictionary words.

OCR frequently confuses glyph sequences that look similar at low resolution: 'rn' → 'm', 'cl' → 'd', 'vv' → 'w', 'III' → 'm'. Only applies substitutions when the original token is NOT a valid dictionary word and the replacement IS, preventing damage to correct text.

Implementation

/// Normalizes common multi-character letter confusions within non-dictionary
/// words.
///
/// Every run of ASCII letters in [text] is examined on its own. A token whose
/// lowercase form is already contained in `englishWords` is returned
/// untouched. Otherwise each confusion pair is tried in table order
/// ('rn' → 'm', 'cl' → 'd', 'vv' → 'w', 'III' → 'm') and the first pair whose
/// substitution yields an entry of `englishWords` is accepted. All occurrences
/// of the pair inside the token are replaced together.
///
/// Matching is performed on the lowercased token, so the confusion keys are
/// lowercased as well; this makes every pair, including 'III', match
/// regardless of the casing of the input.
///
/// The casing of the result follows the original token:
/// * a token equal to its own uppercase form yields an uppercase result;
/// * a token accepted by `isTitleCaseWord` is passed through
///   `toTitleCaseWord`;
/// * any other token yields the lowercase replacement.
///
/// Returns [text] with the accepted substitutions applied. Characters outside
/// `[A-Za-z]` and tokens without a valid replacement are left unchanged.
String normalizeLetterConfusions(String text) {
  const List<MapEntry<String, String>> confusions = [
    MapEntry('rn', 'm'),
    MapEntry('cl', 'd'),
    MapEntry('vv', 'w'),
    MapEntry('III', 'm'),
  ];

  return text.replaceAllMapped(RegExp(r'[A-Za-z]+'), (Match match) {
    final String token = match.group(0)!;
    final String lower = token.toLowerCase();

    // If the word is already valid, don't touch it.
    if (englishWords.contains(lower)) {
      return token;
    }

    // Try each confusion substitution and accept the first that yields
    // a valid dictionary word.
    for (final MapEntry<String, String> entry in confusions) {
      // The token was lowercased above, so the key has to be lowercased too;
      // without this an uppercase key such as 'III' could never match.
      final String pattern = entry.key.toLowerCase();
      if (!lower.contains(pattern)) {
        continue;
      }

      final String candidate = lower.replaceAll(pattern, entry.value);
      if (!englishWords.contains(candidate)) {
        continue;
      }

      // Preserve original casing structure.
      if (token == token.toUpperCase()) {
        return candidate.toUpperCase();
      }
      if (isTitleCaseWord(token)) {
        return toTitleCaseWord(candidate);
      }
      return candidate;
    }

    return token;
  });
}