normalizeStructuredNumericFieldValue function
Normalizes numeric-like values in simple structured field lines.
This targets lines like `Date: zoz5-06-15` and `Amount: ], z5o.75`, where
the label is alphabetic and the value has a date-like or decimal-like
numeric shape polluted only by OCR digit lookalikes.
Implementation
/// Repairs OCR digit lookalikes in the value of a simple `Label: value` line.
///
/// The line must consist of one to three alphabetic words, a colon, and a
/// non-empty value. When the value is classified as date-like or decimal-like
/// by the file's helper predicates, its digit lookalikes are mapped and the
/// result is passed through [normalizeDateSeparators].
///
/// Returns [line] untouched when it is empty, does not have the
/// `Label: value` shape, has more label words than
/// `_structuredFieldMaxLabelWords`, carries a value that neither predicate
/// accepts, or when normalization leaves the value unchanged.
///
/// When a change is made, the result is rebuilt as `label: value`, so
/// whitespace surrounding the line and the colon is not preserved.
String normalizeStructuredNumericFieldValue(String line) {
  if (line.isEmpty) {
    return line;
  }

  // Optional indent, 1-3 alphabetic words, a colon, then a trimmed value.
  final RegExp fieldPattern = RegExp(
    r'^\s*([A-Za-z]+(?:\s+[A-Za-z]+){0,2})\s*:\s*(.+?)\s*$',
  );
  final Match? fieldMatch = fieldPattern.firstMatch(line);
  if (fieldMatch == null) {
    return line;
  }

  final String fieldLabel = fieldMatch.group(_structuredFieldLabelGroup) ?? '';
  final String fieldValue = fieldMatch.group(_structuredFieldValueGroup) ?? '';
  if (fieldLabel.isEmpty || fieldValue.isEmpty) {
    return line;
  }

  // Count the whitespace-separated words that make up the label.
  int wordCount = 0;
  for (final String word in fieldLabel.split(RegExp(r'\s+'))) {
    if (word.isNotEmpty) {
      wordCount += 1;
    }
  }
  if (wordCount < 1 || wordCount > _structuredFieldMaxLabelWords) {
    return line;
  }

  // The date check runs first; the decimal check only runs when it fails.
  final bool looksNumeric = _looksDateLikeStructuredNumericValue(fieldValue) ||
      _looksDecimalStructuredNumericValue(fieldValue);
  if (!looksNumeric) {
    return line;
  }

  // NOTE(review): decimal-like values also go through normalizeDateSeparators,
  // exactly as date-like ones do — confirm that is intended for amounts.
  final String? repairedValue = normalizeDateSeparators(
    _mapDigitLookalikesInValue(fieldValue),
  );
  if (repairedValue == null || repairedValue == fieldValue) {
    return line;
  }
  return '$fieldLabel: $repairedValue';
}