normalizeRegionPostalCodeSpacing function

String normalizeRegionPostalCodeSpacing(
  1. String line
)

Repairs split 5-digit postal codes after uppercase region abbreviations.

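OCR can emit address lines like "SAN FRANCISCO, CA941 05" or "SAN FRANCISCO, CA 941 05". When a comma is followed by a two-letter uppercase region token and then two whitespace-separated digit chunks (each two or three digits long) that total five digits, keep exactly one space between the region and the postal code and remove the internal split. Other splits, such as 1+4 or 4+1 digits, are left unchanged.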

Implementation

/// Rejoins a postal code that was split into two digit chunks after a
/// two-letter uppercase region token.
///
/// The pattern looks for a comma (with optional whitespace), two uppercase
/// letters, optional whitespace, a run of two or three digits, at least one
/// whitespace character, and a second run of two or three digits that ends
/// on a word boundary. Each match is rewritten as the captured prefix and
/// region, a single space, and the two digit runs concatenated.
///
/// A match is left exactly as found when any captured piece is empty, when
/// the combined digit count differs from `_postalCodeLength`, or when either
/// chunk length falls outside `_postalCodeChunkMinLength` ..
/// `_postalCodeChunkMaxLength`.
///
/// Returns [line] unchanged when it is empty or contains no match.
String normalizeRegionPostalCodeSpacing(String line) {
  if (line.isEmpty) {
    return line;
  }

  final RegExp splitPostalPattern =
      RegExp(r'(,\s*)([A-Z]{2})\s*(\d{2,3})\s+(\d{2,3})(?=\b)');

  // True when a single digit chunk has a permitted length.
  bool isAllowedChunkLength(int length) =>
      length >= _postalCodeChunkMinLength &&
      length <= _postalCodeChunkMaxLength;

  // Builds the replacement text for one match, or echoes the match back
  // untouched when it does not have the expected shape.
  String rejoin(Match hit) {
    final String untouched = hit.group(0) ?? line;

    // NOTE(review): the group-index constants are declared outside this
    // block; presumably they map to capture groups 1-4 of the pattern above.
    final String lead = hit.group(_postalPrefixGroup) ?? '';
    final String regionCode = hit.group(_postalRegionGroup) ?? '';
    final String firstDigits = hit.group(_postalLeftChunkGroup) ?? '';
    final String lastDigits = hit.group(_postalRightChunkGroup) ?? '';

    final bool anyPieceMissing = <String>[
      lead,
      regionCode,
      firstDigits,
      lastDigits,
    ].any((String piece) => piece.isEmpty);
    if (anyPieceMissing) {
      return untouched;
    }

    final int totalDigits = firstDigits.length + lastDigits.length;
    if (totalDigits != _postalCodeLength) {
      return untouched;
    }
    if (!isAllowedChunkLength(firstDigits.length) ||
        !isAllowedChunkLength(lastDigits.length)) {
      return untouched;
    }

    // Exactly one space between the region and the rejoined digits.
    return '$lead$regionCode $firstDigits$lastDigits';
  }

  return line.replaceAllMapped(splitPostalPattern, rejoin);
}