normalizeStandaloneUpperDigitTokenSplit function - post_process_line library

textify package
documentation
post_process_line.dart
normalizeStandaloneUpperDigitTokenSplit function

normalizeStandaloneUpperDigitTokenSplit function

String normalizeStandaloneUpperDigitTokenSplit(

String line

)

Splits short standalone uppercase-digit tokens in mixed-case table rows.

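OCR can merge a quantity initial (a single uppercase letter) and the digit that follows it into one token such as B1, as in the row Widget B1 7-50. When such a token sits between a word label (three or more letters, at least one of them lowercase) and a numeric value token (one that starts with a digit), a space is inserted between the letter and the digit, giving Widget B 1 7-50.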

Implementation

/// Inserts a space inside fused "uppercase letter + digit" tokens of [line].
///
/// A token is rewritten (for example `B1` becomes `B 1`) only when all of the
/// following hold:
///
/// * the token is exactly one ASCII uppercase letter followed by one digit;
/// * the closest non-blank token before it is purely alphabetic, at least
///   three letters long, and contains at least one lowercase letter;
/// * the closest non-blank token after it starts with a digit and otherwise
///   contains only digits, `.`, `,`, `/` or `-`.
///
/// Every other character of [line], including the original whitespace runs,
/// is returned unchanged.
String normalizeStandaloneUpperDigitTokenSplit(String line) {
  // Patterns are built once per call instead of once per inspected token.
  final RegExp upperDigitToken = RegExp(r'^[A-Z][0-9]$');
  final RegExp alphabeticWord = RegExp(r'^[A-Za-z]{3,}$');
  final RegExp lowercaseLetter = RegExp(r'[a-z]');
  final RegExp numericValue = RegExp(r'^\d[\d.,/-]*$');

  // Alternating runs of non-whitespace and whitespace; joining them back
  // together reproduces the input exactly.
  final List<String> segments = <String>[
    for (final Match run in RegExp(r'\S+|\s+').allMatches(line))
      run.group(0) ?? '',
  ];
  if (segments.isEmpty) {
    return line;
  }

  // Walks from [start] in direction [step] (-1 or +1) and returns the first
  // segment that is not blank after trimming, or null when none exists.
  String? nearestToken(int start, int step) {
    for (int k = start; k >= 0 && k < segments.length; k += step) {
      final String candidate = segments[k];
      if (candidate.trim().isNotEmpty) {
        return candidate;
      }
    }
    return null;
  }

  for (int index = 0; index < segments.length; index++) {
    final String token = segments[index];
    // Whitespace runs can never match this pattern, so no separate blank
    // check is required here.
    if (!upperDigitToken.hasMatch(token)) {
      continue;
    }

    final String? before = nearestToken(index - 1, -1);
    final String? after = nearestToken(index + 1, 1);

    final bool hasWordLabelBefore =
        before != null &&
        alphabeticWord.hasMatch(before) &&
        lowercaseLetter.hasMatch(before);
    final bool hasNumericValueAfter =
        after != null && numericValue.hasMatch(after);

    if (hasWordLabelBefore && hasNumericValueAfter) {
      segments[index] = '${token[0]} ${token[1]}';
    }
  }

  return segments.join();
}