normalizeStructuredFieldLine function

String normalizeStructuredFieldLine(
  String line, {
  required bool applyDictionary,
})

Normalizes simple structured field lines such as Name: john smith.

This preserves common OCR output for label/value layouts without changing general prose. The label is title-cased when it is purely alphabetic, and the value is title-cased only when it is a multi-word alphabetic phrase.

Implementation

/// Normalizes a simple `Label: value` line such as `Name: john smith`.
///
/// [line] is first passed through `_normalizeMissingStructuredFieldColon`.
/// The result is then matched against a pattern of one to three alphabetic
/// label words, a colon, and a non-empty value. When the line does not match,
/// or its label fails validation, the colon-normalized text is returned
/// without further changes.
///
/// For a matching line:
///
/// * The label is upper-cased when it is a single token accepted by
///   `_isUppercaseCodeLabel` and the value is accepted by
///   `_looksStructuredCodeLikeValue`. Otherwise each label word is passed
///   through `_normalizeStructuredLabelToken` and then title-cased.
/// * The value is title-cased only when every word is alphabetic and the
///   word count lies between `_structuredFieldValueMinWords` and
///   `_structuredFieldValueMaxWords` (inclusive).
/// * When [applyDictionary] is true, a `name` label additionally runs each
///   word of such a value through `_normalizeNameLikeToken`, and a `status`
///   label with a single alphabetic value word has its value replaced by the
///   result of `_normalizeStructuredStatusValue`.
/// * The value is finally passed through `_normalizeStructuredCodeValue`.
///
/// Returns the rebuilt line in the form `Label: value`.
String normalizeStructuredFieldLine(
  String line, {
  required bool applyDictionary,
}) {
  final String colonNormalized = _normalizeMissingStructuredFieldColon(line);
  final Match? match = RegExp(
    r'^\s*([A-Za-z]+(?:\s+[A-Za-z]+){0,2})\s*:\s*(.+?)\s*$',
  ).firstMatch(colonNormalized);
  if (match == null) {
    return colonNormalized;
  }

  final String rawLabel = match.group(1) ?? '';
  final String rawValue = match.group(regexGroupSecond) ?? '';
  if (rawLabel.isEmpty || rawValue.isEmpty) {
    return colonNormalized;
  }

  // Shared by the label and value tokenizers below.
  final RegExp whitespace = RegExp(r'\s+');

  final List<String> labelTokens = rawLabel
      .split(whitespace)
      .where((String token) => token.isNotEmpty)
      .toList();
  // Every bail-out returns the colon-normalized text, matching the early
  // returns above, so a repaired colon is never discarded on this path.
  if (labelTokens.isEmpty ||
      labelTokens.length > _structuredFieldLabelMaxWords ||
      !labelTokens.every(isAlphaWord)) {
    return colonNormalized;
  }

  // A lone upper-case code label next to a code-like value keeps its casing
  // instead of being title-cased.
  final bool preserveUppercaseCodeLabel =
      labelTokens.length == 1 &&
      _isUppercaseCodeLabel(labelTokens.first) &&
      _looksStructuredCodeLikeValue(rawValue);

  final String normalizedLabel = preserveUppercaseCodeLabel
      ? labelTokens.first.toUpperCase()
      : labelTokens
            .map(
              (String token) => _normalizeStructuredLabelToken(
                token,
                applyDictionary: applyDictionary,
              ),
            )
            .map(toTitleCaseWord)
            .join(' ');

  String normalizedValue = rawValue;
  final List<String> valueTokens = rawValue
      .split(whitespace)
      .where((String token) => token.isNotEmpty)
      .toList();
  final bool nameLikeLabel =
      applyDictionary &&
      labelTokens.length == 1 &&
      labelTokens.first.toLowerCase() == 'name';
  final bool alphaPhrase =
      valueTokens.length >= _structuredFieldValueMinWords &&
      valueTokens.length <= _structuredFieldValueMaxWords &&
      valueTokens.every(isAlphaWord);
  if (alphaPhrase) {
    normalizedValue = valueTokens
        .map(
          (String token) =>
              nameLikeLabel ? _normalizeNameLikeToken(token) : token,
        )
        .map(toTitleCaseWord)
        .join(' ');
  }

  // A single-word status value is handled separately and overrides any
  // title-casing applied above.
  final bool statusLikeLabel =
      applyDictionary &&
      labelTokens.length == 1 &&
      labelTokens.first.toLowerCase() == 'status' &&
      valueTokens.length == 1 &&
      isAlphaWord(valueTokens.first);
  if (statusLikeLabel) {
    normalizedValue = _normalizeStructuredStatusValue(valueTokens.first);
  }

  normalizedValue = _normalizeStructuredCodeValue(normalizedValue);

  return '$normalizedLabel: $normalizedValue';
}