resolveILAmbiguity function
Resolves I/l ambiguity based on word-level case context.
Uppercase 'I' and lowercase 'l' are nearly identical vertical strokes that OCR frequently confuses. When the surrounding letters in a word establish a dominant case, the ambiguous glyph is matched to that case.
Positional exceptions:
- An 'I' at the START of a word is never changed: it could be a capitalized word like "In" or "Is".
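- An 'I' preceded by another uppercase letter is never changed: it is likely part of an acronym suffix like "OpenAI". (Words are matched as runs of letters only, so in "GPT-4I" the digit splits the token and the trailing "I" is matched as a separate word rather than by this rule.)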
Implementation
/// Resolves I/l ambiguity in [line] based on word-level case context.
///
/// Every maximal run of ASCII letters in [line] is treated as a word and
/// examined independently; all other characters are left untouched. Words
/// shorter than `_minWordLengthForILAmbiguity` are returned unchanged.
///
/// Within a word, the letters other than 'I' and 'l' vote on the dominant
/// case:
///
/// * If uppercase letters outnumber lowercase ones, every 'l' becomes 'I'.
/// * If lowercase letters outnumber uppercase ones, an 'I' becomes 'l'
///   unless it is the first letter of the word or directly follows a letter
///   that remains uppercase in the output.
/// * On a tie, the word is returned unchanged.
///
/// Returns the rewritten line.
String resolveILAmbiguity(String line) {
  return line.replaceAllMapped(RegExp(r'[A-Za-z]+'), (match) {
    final String word = match.group(0)!;
    if (word.length < _minWordLengthForILAmbiguity) {
      return word;
    }

    // Count unambiguous case characters; 'I' and 'l' are the glyphs under
    // question, so they must not vote on the word's dominant case.
    int upper = 0;
    int lower = 0;
    bool hasI = false;
    bool hasLowerL = false;
    for (int i = 0; i < word.length; i++) {
      final int code = word.codeUnitAt(i);
      if (code == _uppercaseICodeUnit) {
        hasI = true;
      } else if (code == _lowercaseLCodeUnit) {
        hasLowerL = true;
      } else if (isUpper(code)) {
        upper++;
      } else if (isLower(code)) {
        lower++;
      }
    }

    if (!hasI && !hasLowerL) {
      return word;
    }

    if (upper > lower && hasLowerL) {
      return word.replaceAll('l', 'I');
    }

    if (lower > upper && hasI) {
      // Only convert I→l when the I is clearly in a lowercase context:
      // skip I at position 0 (could be a capitalized word like "In")
      // and skip I preceded by an uppercase letter (acronym like "OpenAI").
      //
      // "Preceded by an uppercase letter" is judged on the *emitted* text:
      // an 'I' that was just rewritten to 'l' no longer counts as uppercase.
      // Otherwise a run of ambiguous glyphs such as "heIIo" would only have
      // its first 'I' fixed, yielding "helIo" instead of "hello".
      final StringBuffer sb = StringBuffer();
      bool previousIsUpper = false;
      for (int i = 0; i < word.length; i++) {
        final int code = word.codeUnitAt(i);
        if (code == _uppercaseICodeUnit) {
          final bool keepAsI = i == 0 || previousIsUpper;
          if (keepAsI) {
            sb.writeCharCode(code);
          } else {
            sb.write('l');
          }
          previousIsUpper = keepAsI;
        } else {
          sb.writeCharCode(code);
          previousIsUpper = isUpper(code);
        }
      }
      return sb.toString();
    }

    return word;
  });
}