normalizeStandaloneUpperDigitTokenSplit function
Splits short standalone uppercase-digit tokens in mixed-case table rows.
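OCR can merge a quantity initial (a single uppercase letter) and the digit
that follows it into one token such as B1, as in the row Widget B1 7-50.
When such a token sits between a letters-only word label of at least three
characters that contains a lowercase letter and a numeric value token, the
missing space is inserted (B1 becomes B 1).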
Implementation
/// Inserts a space into two-character "uppercase letter + digit" tokens
/// (for example `B1`) when they look like two table cells glued together.
///
/// [line] is cut into alternating runs of non-whitespace and whitespace so
/// that the original spacing is reproduced exactly when the pieces are joined
/// back together. A token is rewritten from `B1` to `B 1` only if both of the
/// following hold:
///
/// * the closest non-blank token to its left consists solely of ASCII letters,
///   is at least three characters long and contains a lowercase letter;
/// * the closest non-blank token to its right starts with a digit and
///   otherwise contains only digits and the characters `.`, `,`, `/`, `-`.
///
/// Returns [line] unchanged when it is empty; otherwise returns the re-joined
/// pieces, which equal [line] if no token qualified.
String normalizeStandaloneUpperDigitTokenSplit(String line) {
  // Patterns are built once per call instead of once per token.
  final upperDigitPair = RegExp(r'^[A-Z][0-9]$');
  final lettersOnlyWord = RegExp(r'^[A-Za-z]{3,}$');
  final anyLowercase = RegExp(r'[a-z]');
  final numericValue = RegExp(r'^\d[\d.,/-]*$');

  final segments = [
    for (final found in RegExp(r'\S+|\s+').allMatches(line))
      found.group(0) ?? '',
  ];
  if (segments.isEmpty) {
    return line;
  }

  // Walks from [start] in steps of [step] (+1 or -1) and yields the first
  // segment that is not blank after trimming, or null when the walk runs off
  // either end of the list.
  String? nearestToken(int start, int step) {
    for (var k = start; k >= 0 && k < segments.length; k += step) {
      final candidate = segments[k];
      if (candidate.trim().isNotEmpty) {
        return candidate;
      }
    }
    return null;
  }

  for (var idx = 0; idx < segments.length; idx++) {
    final token = segments[idx];
    // A match is exactly one letter plus one digit, so it can never be a
    // blank segment; no separate whitespace test is required.
    if (!upperDigitPair.hasMatch(token)) {
      continue;
    }

    final before = nearestToken(idx - 1, -1);
    final after = nearestToken(idx + 1, 1);

    final hasWordLabelBefore = before != null &&
        lettersOnlyWord.hasMatch(before) &&
        anyLowercase.hasMatch(before);
    final hasNumericValueAfter = after != null && numericValue.hasMatch(after);

    if (hasWordLabelBefore && hasNumericValueAfter) {
      segments[idx] = '${token[0]} ${token[1]}';
    }
  }

  return segments.join();
}