normalizeRepeatedCommaSuffix function

String normalizeRepeatedCommaSuffix(String line)

Splits all-caps merchant/location tokens that repeat the comma suffix.

OCR sometimes drops the space before a trailing location token in lines like EUROLOJAMATOSINHOS, MATOSINHOS. When the final token before the comma is all-caps and redundantly ends with the same all-caps word that starts the comma suffix, insert the missing space, so that the example becomes EUROLOJA MATOSINHOS, MATOSINHOS.

Implementation

/// Re-inserts a space that went missing in front of a word which is repeated
/// right after the first comma of [line].
///
/// The text is split at the first `,`. The part after the comma must open
/// with a run of at least four ASCII capitals that ends on a word boundary;
/// the part before the comma must close with a run of ASCII capitals that
/// ends with that same word and has extra letters in front of it. When the
/// repeated word and the leftover stem both meet the file-level minimum
/// lengths (`_repeatedCommaSuffixMinLength` and
/// `_repeatedCommaSuffixMinStemLength`, declared outside this function), the
/// run is split as `<stem> <word>`.
///
/// Returns [line] untouched whenever any of those conditions fails. When a
/// split is made, the result also drops whitespace directly before the comma
/// and uses exactly one space after it.
String normalizeRepeatedCommaSuffix(String line) {
  final int comma = line.indexOf(',');
  // The comma must exist and have at least one character on each side.
  final bool hasTextOnBothSides = comma > 0 && comma < line.length - 1;
  if (!hasTextOnBothSides) {
    return line;
  }

  final String head = line.substring(0, comma).trimRight();
  final String tail = line.substring(comma + 1).trimLeft();
  if (head.isEmpty || tail.isEmpty) {
    return line;
  }

  // Leading all-caps word of the text that follows the comma.
  final Match? tailWord = RegExp(r'^([A-Z]{4,})(?:\b|$)').firstMatch(tail);
  if (tailWord == null) {
    return line;
  }
  final String repeated = tailWord[1] ?? '';
  if (repeated.length < _repeatedCommaSuffixMinLength) {
    return line;
  }

  // Trailing run of capitals in the text that precedes the comma.
  final Match? headRun = RegExp(r'([A-Z]+)$').firstMatch(head);
  if (headRun == null) {
    return line;
  }
  final String fused = headRun[1] ?? '';

  // The run has to be "<stem><repeated>" with a stem that is long enough;
  // a run that is only the repeated word has nothing to split off.
  final int stemLength = fused.length - repeated.length;
  final bool splittable = fused != repeated &&
      fused.endsWith(repeated) &&
      stemLength >= _repeatedCommaSuffixMinStemLength;
  if (!splittable) {
    return line;
  }

  // Everything up to the end of the stem is kept verbatim; the repeated word
  // is then re-attached behind a single space.
  final int splitAt = headRun.start + stemLength;
  return '${head.substring(0, splitAt)} $repeated, $tail';
}