normalizeDigitSegments function
Corrects letter-like confusions inside digit-dominant token segments.
Implementation
/// Repairs letter-for-digit confusions inside the digit-like parts of [line]
/// and returns the rewritten text.
///
/// [line] is cut into maximal runs of alphanumeric and non-alphanumeric code
/// units. Non-alphanumeric runs are copied through untouched. An alphanumeric
/// run is rewritten in one of two situations:
///
/// * It is digit-dominant (it contains a digit and at least as many digits
///   as letters): each letter directly next to a digit is replaced through
///   [digitConfusionMap].
/// * It is all letters and either [_isGridLikeDigitToken] accepts it or it is
///   no longer than [_maxShortLetterSegmentLength]: the whole run is replaced
///   through [digitConfusionMap] when both nearest alphanumeric runs are
///   digit-dominant, or when only one of them is and every character is in
///   [highConfidenceDigitLookalikes].
///
/// Characters without an entry in [digitConfusionMap] are kept as they are.
String normalizeDigitSegments(String line) {
  // Tokenise: `runs[k]` is a maximal stretch of code units that are either
  // all alphanumeric or all not; `runIsAlnum[k]` records which.
  final runs = <String>[];
  final runIsAlnum = <bool>[];
  var runStart = 0;
  bool? runKind;
  for (var pos = 0; pos < line.length; pos++) {
    final unit = line.codeUnitAt(pos);
    final kind = isLetter(unit) || isDigit(unit);
    if (runKind != null && kind != runKind) {
      // The class flipped: close the run that ended just before `pos`.
      runs.add(line.substring(runStart, pos));
      runIsAlnum.add(runKind);
      runStart = pos;
    }
    runKind = kind;
  }
  if (runKind != null) {
    // Non-empty input always leaves one trailing run to flush.
    runs.add(line.substring(runStart));
    runIsAlnum.add(runKind);
  }

  // Whether [text] has at least one digit and no more letters than digits.
  bool digitsDominate(String text) {
    var digitCount = 0;
    var letterCount = 0;
    for (final unit in text.codeUnits) {
      if (isDigit(unit)) {
        digitCount++;
      } else if (isLetter(unit)) {
        letterCount++;
      }
    }
    return digitCount > 0 && digitCount >= letterCount;
  }

  // Whether [text] is non-empty and consists of letters only.
  bool lettersOnly(String text) {
    if (text.isEmpty) return false;
    for (final unit in text.codeUnits) {
      if (!isLetter(unit)) return false;
    }
    return true;
  }

  // Whether the closest alphanumeric run reached from index [from] by
  // stepping [step] (-1 = leftwards, 1 = rightwards) is digit-dominant.
  // Returns false when no alphanumeric run exists in that direction.
  bool neighbourDigitsDominate(int from, int step) {
    for (var k = from + step; k >= 0 && k < runs.length; k += step) {
      if (runIsAlnum[k]) return digitsDominate(runs[k]);
    }
    return false;
  }

  final result = StringBuffer();
  for (var idx = 0; idx < runs.length; idx++) {
    final run = runs[idx];

    // Separators pass through verbatim.
    if (!runIsAlnum[idx]) {
      result.write(run);
      continue;
    }

    if (digitsDominate(run)) {
      // Digit-dominant run: only letters that touch a digit are remapped;
      // adjacency is judged on the original run, not on the mapped output.
      final last = run.length - 1;
      for (var i = 0; i <= last; i++) {
        final ch = run[i];
        final touchesDigit = isLetter(run.codeUnitAt(i)) &&
            ((i > 0 && isDigit(run.codeUnitAt(i - 1))) ||
                (i < last && isDigit(run.codeUnitAt(i + 1))));
        result.write(touchesDigit ? (digitConfusionMap[ch] ?? ch) : ch);
      }
      continue;
    }

    final bool candidate = lettersOnly(run) &&
        (_isGridLikeDigitToken(
              runs,
              runIsAlnum,
              idx,
              run,
              digitsDominate,
            ) ||
            run.length <= _maxShortLetterSegmentLength);
    if (!candidate) {
      result.write(run);
      continue;
    }

    // All-letter run next to digit-dominant runs, e.g. the "Ol" sitting
    // between "2020" and "02" in "2020-Ol-02".
    final digitsBefore = neighbourDigitsDominate(idx, -1);
    final digitsAfter = neighbourDigitsDominate(idx, 1);

    // Two digit-dominant neighbours: always remap. Exactly one: remap only
    // if every character is a high-confidence digit lookalike.
    var remap = digitsBefore && digitsAfter;
    if (!remap && (digitsBefore || digitsAfter)) {
      remap = true;
      for (var i = 0; i < run.length && remap; i++) {
        remap = highConfidenceDigitLookalikes.contains(run[i]);
      }
    }

    if (!remap) {
      result.write(run);
      continue;
    }
    for (var i = 0; i < run.length; i++) {
      final ch = run[i];
      result.write(digitConfusionMap[ch] ?? ch);
    }
  }
  return result.toString();
}