normalizePunctuationSpacing function
Normalizes punctuation spacing errors common in OCR.
Implementation
/// Normalizes punctuation spacing errors common in OCR output.
///
/// Works in two passes over [text]:
///
/// 1. Whitespace directly before one of `. , ! ? ; :` is removed, so
///    `"word . next"` becomes `"word. next"`.
/// 2. A single space is inserted between one of those punctuation marks and
///    an ASCII letter that immediately follows it, so `"word,next"` becomes
///    `"word, next"`. A dot that is also preceded by a letter (as judged by
///    `isLetter`) is left untouched, which keeps domain-, URL- and
///    abbreviation-like text such as `"www.AMAZON"` intact.
///
/// Returns the normalized string.
String normalizePunctuationSpacing(String text) {
  // Pass 1: "word . next" -> "word. next".
  final spaceBeforePunct = RegExp(r'\s+([.,!?;:])');
  final tightened = text.replaceAllMapped(
    spaceBeforePunct,
    (m) => m.group(regexGroupFirst)!,
  );

  // Pass 2: "word,next" -> "word, next", leaving letter-dot-letter runs alone.
  // A dot that follows a digit (e.g. the numbered list item "1.Hello") is not
  // exempt, so it still receives a space.
  final punctBeforeLetter = RegExp(r'([.,!?;:])([A-Za-z])');
  return tightened.replaceAllMapped(punctBeforeLetter, (m) {
    final mark = m.group(regexGroupFirst)!;
    final following = m.group(regexGroupSecond)!;

    // Match offsets refer to the string being scanned (`tightened`), so the
    // character just before the punctuation can be inspected directly.
    final dotInsideWord = mark == '.' &&
        m.start > 0 &&
        isLetter(tightened.codeUnitAt(m.start - 1));

    return dotInsideWord ? '$mark$following' : '$mark $following';
  });
}