normalizeStructuredFieldLine function
Normalizes simple structured field lines such as Name: john smith.
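This preserves common OCR output for label/value layouts without changing general prose. The label is title-cased when it is purely alphabetic, except that a single-word label recognized as an uppercase code label is kept in upper case when its value looks code-like. The value is title-cased only when it is a multi-word alphabetic phrase.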
Implementation
/// Normalizes a single `label: value` line into `'<label>: <value>'`.
///
/// [line] is first run through [_normalizeMissingStructuredFieldColon]. If
/// the result is not a one-to-three word alphabetic label followed by a colon
/// and a non-empty value, that colon-normalized text is returned as is. If
/// the label tokens fail the later word-count or [isAlphaWord] checks, the
/// original [line] is returned instead.
///
/// Label handling:
/// * A single-token label accepted by [_isUppercaseCodeLabel], whose value is
///   accepted by [_looksStructuredCodeLikeValue], is upper-cased.
/// * Otherwise every label token goes through
///   [_normalizeStructuredLabelToken] (which receives [applyDictionary]) and
///   then [toTitleCaseWord].
///
/// Value handling:
/// * When the value has between [_structuredFieldValueMinWords] and
///   [_structuredFieldValueMaxWords] tokens and all of them satisfy
///   [isAlphaWord], each token is passed to [toTitleCaseWord]. If
///   [applyDictionary] is set and the label is `name` (case-insensitive), each
///   token goes through [_normalizeNameLikeToken] first.
/// * When [applyDictionary] is set, the label is `status` (case-insensitive)
///   and the value is a single [isAlphaWord] token, the value is replaced by
///   the result of [_normalizeStructuredStatusValue].
/// * The resulting value is always passed through
///   [_normalizeStructuredCodeValue] before being joined to the label.
String normalizeStructuredFieldLine(
  String line, {
  required bool applyDictionary,
}) {
  final String withColon = _normalizeMissingStructuredFieldColon(line);

  final RegExp fieldPattern = RegExp(
    r'^\s*([A-Za-z]+(?:\s+[A-Za-z]+){0,2})\s*:\s*(.+?)\s*$',
  );
  final Match? fieldMatch = fieldPattern.firstMatch(withColon);
  if (fieldMatch == null) {
    return withColon;
  }

  final String labelText = fieldMatch.group(1) ?? '';
  final String valueText = fieldMatch.group(regexGroupSecond) ?? '';
  if (labelText.isEmpty || valueText.isEmpty) {
    return withColon;
  }

  // Splits on whitespace runs and discards the empty pieces that leading or
  // trailing whitespace would otherwise produce.
  final RegExp whitespaceRun = RegExp(r'\s+');
  List<String> wordsOf(String text) => text
      .split(whitespaceRun)
      .where((String piece) => piece.isNotEmpty)
      .toList();

  final List<String> labelWords = wordsOf(labelText);
  // These exits hand back the untouched input rather than the
  // colon-normalized text.
  if (labelWords.isEmpty ||
      labelWords.length > _structuredFieldLabelMaxWords ||
      !labelWords.every(isAlphaWord)) {
    return line;
  }

  final String normalizedLabel;
  if (labelWords.length == 1 &&
      _isUppercaseCodeLabel(labelWords.first) &&
      _looksStructuredCodeLikeValue(valueText)) {
    normalizedLabel = labelWords.first.toUpperCase();
  } else {
    normalizedLabel = <String>[
      for (final String word in labelWords)
        toTitleCaseWord(
          _normalizeStructuredLabelToken(
            word,
            applyDictionary: applyDictionary,
          ),
        ),
    ].join(' ');
  }

  final List<String> valueWords = wordsOf(valueText);

  // Dictionary-driven rules key off the raw (not yet normalized) label token.
  bool isSingleWordLabel(String expected) =>
      labelWords.length == 1 && labelWords.first.toLowerCase() == expected;

  final bool treatAsName = applyDictionary && isSingleWordLabel('name');
  final bool isAlphabeticPhrase =
      valueWords.length >= _structuredFieldValueMinWords &&
      valueWords.length <= _structuredFieldValueMaxWords &&
      valueWords.every(isAlphaWord);

  String normalizedValue = valueText;
  if (isAlphabeticPhrase) {
    normalizedValue = <String>[
      for (final String word in valueWords)
        toTitleCaseWord(treatAsName ? _normalizeNameLikeToken(word) : word),
    ].join(' ');
  }

  // A status value, when applicable, overrides any phrase casing done above.
  final bool treatAsStatus =
      applyDictionary &&
      isSingleWordLabel('status') &&
      valueWords.length == 1 &&
      isAlphaWord(valueWords.first);
  if (treatAsStatus) {
    normalizedValue = _normalizeStructuredStatusValue(valueWords.first);
  }

  final String finalValue = _normalizeStructuredCodeValue(normalizedValue);
  return '$normalizedLabel: $finalValue';
}