resolveILAmbiguity function

String resolveILAmbiguity(String line)

Resolves I/l ambiguity based on word-level case context.

Uppercase 'I' and lowercase 'l' are nearly identical vertical strokes that OCR frequently confuses. When the surrounding letters in a word establish a dominant case, the ambiguous glyph is matched to that case.

Positional exceptions:

  • An 'I' at the START of a word is never changed: it could be a capitalized word like "In" or "Is".
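  • An 'I' preceded by another uppercase letter is never changed: it is likely part of an acronym suffix like "OpenAI". (Words are runs of ASCII letters only, so in "GPT-4I" the trailing "I" is treated as a separate one-letter word and is left unchanged.)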

Implementation

/// Resolves I/l ambiguity in [line] based on word-level case context.
///
/// Every maximal run of ASCII letters is treated as a word. Words shorter
/// than `_minWordLengthForILAmbiguity` are left untouched. For longer words
/// the unambiguous letters (everything except uppercase 'I' and lowercase
/// 'l') are counted by case:
///
/// * If uppercase letters outnumber lowercase ones, every 'l' becomes 'I'.
/// * If lowercase letters outnumber uppercase ones, an 'I' becomes 'l'
///   unless it is the first letter of the word or directly follows a letter
///   that is uppercase in the result (an unambiguous capital or an 'I' that
///   was itself kept), e.g. the "AI" in "OpenAI".
/// * On a tie the word is returned unchanged.
///
/// Returns a copy of [line] with the affected words rewritten; all
/// non-letter characters are preserved as-is.
String resolveILAmbiguity(String line) {
  return line.replaceAllMapped(RegExp(r'[A-Za-z]+'), (match) {
    final String word = match.group(0)!;
    if (word.length < _minWordLengthForILAmbiguity) {
      return word;
    }

    // Tally only the letters whose case is unambiguous; 'I' and 'l' are the
    // glyphs under question, so they must not vote on the dominant case.
    int upper = 0;
    int lower = 0;
    bool hasI = false;
    bool hasLowerL = false;
    for (final int code in word.codeUnits) {
      if (code == _uppercaseICodeUnit) {
        hasI = true;
      } else if (code == _lowercaseLCodeUnit) {
        hasLowerL = true;
      } else if (isUpper(code)) {
        upper++;
      } else if (isLower(code)) {
        lower++;
      }
    }

    // Predominantly uppercase word: any 'l' is a misread 'I'.
    if (upper > lower && hasLowerL) {
      return word.replaceAll('l', 'I');
    }

    // Predominantly lowercase word: convert 'I' to 'l' only where the 'I'
    // is clearly in a lowercase context.
    if (lower > upper && hasI) {
      final StringBuffer sb = StringBuffer();
      // Case of the previously *emitted* character. Tracking the resolved
      // case (rather than re-reading the input) means an 'I' that follows
      // an 'I' just converted to 'l' is converted too, so "wiII" becomes
      // "will" instead of "wilI".
      bool previousIsUpper = false;
      for (int i = 0; i < word.length; i++) {
        final int code = word.codeUnitAt(i);
        if (code != _uppercaseICodeUnit) {
          sb.writeCharCode(code);
          previousIsUpper = isUpper(code);
          continue;
        }
        if (i == 0 || previousIsUpper) {
          // Word-initial 'I' may be a capitalized word ("In"); an 'I' after
          // an uppercase letter is likely an acronym ("OpenAI"). Keep it.
          sb.writeCharCode(code);
          previousIsUpper = true;
        } else {
          sb.write('l');
          previousIsUpper = false;
        }
      }
      return sb.toString();
    }

    // No dominant case (tie) or nothing ambiguous to rewrite.
    return word;
  });
}