normalizeRegionPostalCodeSpacing function
Repairs split 5-digit postal codes after uppercase region abbreviations.
OCR can emit address lines like `SAN FRANCISCO, CA941 05` or
`SAN FRANCISCO, CA 941 05`. When a comma is followed by a two-letter
uppercase region token and two digit chunks (2-3 digits each) totaling five
digits, the function keeps exactly one space before the postal code and
removes the internal split, so both examples become
`SAN FRANCISCO, CA 94105`.
Implementation
/// Rejoins a postal code that was split into two digit chunks after a
/// two-letter uppercase region token.
///
/// Scans [line] for a comma, optional whitespace, two uppercase ASCII letters,
/// optional whitespace, a run of 2-3 digits, at least one whitespace
/// character, and a second run of 2-3 digits ending at a word boundary. Each
/// occurrence whose chunks pass the length checks is rewritten as the original
/// comma prefix, the region token, a single space, and the two chunks joined
/// together. Occurrences that fail a check are left exactly as they were.
///
/// Returns [line] unchanged when it is empty or contains no such occurrence.
String normalizeRegionPostalCodeSpacing(String line) {
  if (line.isEmpty) {
    return line;
  }

  final RegExp splitPostalPattern =
      RegExp(r'(,\s*)([A-Z]{2})\s*(\d{2,3})\s+(\d{2,3})(?=\b)');

  // Bounds come from file-level constants that are not visible in this block.
  bool hasAllowedChunkLength(String digits) =>
      digits.length >= _postalCodeChunkMinLength &&
      digits.length <= _postalCodeChunkMaxLength;

  String rejoinChunks(Match found) {
    // Text to hand back whenever the occurrence must not be altered.
    final String untouched = found.group(0) ?? line;

    // NOTE(review): the group-index constants are declared elsewhere;
    // presumably they map to capture groups 1-4 of the pattern above, in
    // order - confirm against their declarations.
    final String leading = found.group(_postalPrefixGroup) ?? '';
    final String regionCode = found.group(_postalRegionGroup) ?? '';
    final String firstDigits = found.group(_postalLeftChunkGroup) ?? '';
    final String lastDigits = found.group(_postalRightChunkGroup) ?? '';

    final bool somethingMissing = <String>[
      leading,
      regionCode,
      firstDigits,
      lastDigits,
    ].any((String part) => part.isEmpty);
    if (somethingMissing) {
      return untouched;
    }

    // The two chunks together must form a full-length postal code.
    if (firstDigits.length + lastDigits.length != _postalCodeLength) {
      return untouched;
    }
    if (!hasAllowedChunkLength(firstDigits) ||
        !hasAllowedChunkLength(lastDigits)) {
      return untouched;
    }

    // Exactly one space between region and code, none inside the code.
    return '$leading$regionCode $firstDigits$lastDigits';
  }

  return line.replaceAllMapped(splitPostalPattern, rejoinChunks);
}