normalizePunctuationSpacing function

String normalizePunctuationSpacing(
  1. String text
)

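Normalizes punctuation spacing errors common in OCR output: whitespace directly before any of `. , ! ? ; :` is removed, and a space is inserted after one of those marks when an ASCII letter follows it immediately. A dot that is preceded by a letter is left unchanged, so domain-like text such as "www.AMAZON" is preserved.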

Implementation

/// Tidies the spacing around punctuation in [text], targeting mistakes that
/// are common in OCR output.
///
/// Two passes run in order (examples assume `regexGroupFirst` and
/// `regexGroupSecond` select capture groups 1 and 2 - confirm at their
/// definition):
///
///  1. A run of whitespace directly before one of `. , ! ? ; :` is dropped,
///     e.g. "word . next" -> "word. next".
///  2. A space is inserted between one of those marks and an ASCII letter
///     that immediately follows it, e.g. "word,next" -> "word, next" and
///     "1.Hello" -> "1. Hello". The exception is a dot whose preceding
///     character is accepted by [isLetter]: that match is left untouched, so
///     "word.next" and domain-like text such as "www.AMAZON" stay as they are.
///
/// Returns the normalized string; [text] itself is not modified.
String normalizePunctuationSpacing(String text) {
  final RegExp spaceBeforePunct = RegExp(r'\s+([.,!?;:])');
  final RegExp punctBeforeLetter = RegExp(r'([.,!?;:])([A-Za-z])');

  // Pass 1: replace "<whitespace><mark>" with just the captured mark.
  final String tightened = text.replaceAllMapped(
    spaceBeforePunct,
    (Match m) => m.group(regexGroupFirst)!,
  );

  // Pass 2: separate "<mark><letter>" with a space, unless the mark is a dot
  // sitting between two letters (domain, URL, or abbreviation pattern).
  return tightened.replaceAllMapped(punctBeforeLetter, (Match m) {
    final String mark = m.group(regexGroupFirst)!;
    final String following = m.group(regexGroupSecond)!;

    // m.start indexes into `tightened`, the string being scanned. The
    // `m.start > 0` guard keeps the look-behind in range when the mark is the
    // very first character.
    final bool dotInsideWord = mark == '.' &&
        m.start > 0 &&
        isLetter(tightened.codeUnitAt(m.start - 1));

    return dotInsideWord ? '$mark$following' : '$mark $following';
  });
}