normalizeRepeatedCommaSuffix function
Splits all-caps merchant/location tokens that repeat the comma suffix.
OCR sometimes drops the space before a trailing location token in lines like
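EUROLOJAMATOSINHOS, MATOSINHOS. When the run of uppercase A-Z letters that
ends the text before the first comma redundantly ends with the same uppercase
word (four or more letters) that starts the comma suffix, and enough letters
remain in front of that word, insert the missing space, e.g. producing
EUROLOJA MATOSINHOS, MATOSINHOS.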
Implementation
/// Re-inserts a space that was lost in front of a repeated location word.
///
/// The [line] is split at its first comma. If the text after the comma starts
/// with an uppercase A-Z word, and the uppercase run that ends the text before
/// the comma finishes with that same word (while being longer than it), the
/// run is split in two: `STEMWORD, WORD ...` becomes `STEM WORD, WORD ...`.
///
/// The original [line] is returned untouched whenever any precondition fails:
/// no usable comma, nothing on either side of it, no qualifying uppercase
/// word, or a word/stem shorter than the file-level minimum lengths
/// (`_repeatedCommaSuffixMinLength`, `_repeatedCommaSuffixMinStemLength`;
/// their values are defined outside this function).
///
/// When a rewrite happens, whitespace around the comma is normalised to
/// exactly `", "`.
String normalizeRepeatedCommaSuffix(String line) {
  // Only the first comma is considered, and it must have text on both sides.
  final int splitAt = line.indexOf(',');
  final bool hasInteriorComma = splitAt > 0 && splitAt != line.length - 1;
  if (!hasInteriorComma) {
    return line;
  }

  final String head = line.substring(0, splitAt).trimRight();
  final String tail = line.substring(splitAt + 1).trimLeft();
  if (head.isEmpty || tail.isEmpty) {
    return line;
  }

  // The comma suffix must open with a standalone run of 4+ uppercase letters.
  final RegExp leadingCapsWord = RegExp(r'^([A-Z]{4,})(?:\b|$)');
  final Match? tailMatch = leadingCapsWord.firstMatch(tail);
  if (tailMatch == null) {
    return line;
  }
  final String repeated = tailMatch.group(1) ?? '';
  if (repeated.length < _repeatedCommaSuffixMinLength) {
    return line;
  }

  // Grab the uppercase run that ends the text in front of the comma.
  final RegExp trailingCapsRun = RegExp(r'([A-Z]+)$');
  final Match? headMatch = trailingCapsRun.firstMatch(head);
  if (headMatch == null) {
    return line;
  }
  final String fused = headMatch.group(1) ?? '';

  // An exact match means the word is already separated; only a strictly
  // longer run that ends with the word is a fused candidate.
  final bool isFusedRepeat = fused != repeated && fused.endsWith(repeated);
  if (!isFusedRepeat) {
    return line;
  }

  // What is left in front of the repeated word must be long enough to be
  // treated as a word of its own.
  final int stemLength = fused.length - repeated.length;
  if (stemLength < _repeatedCommaSuffixMinStemLength) {
    return line;
  }

  final String leading = head.substring(0, headMatch.start);
  final String stem = fused.substring(0, stemLength);
  return '$leading$stem $repeated, $tail';
}