normalizeDigitSegments function

String normalizeDigitSegments(
  1. String line
)

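Corrects letters that are likely misread digits (e.g. `Ol` → `01`) inside digit-dominant token segments, and in short all-letter segments whose neighboring segments are digit-dominant.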

Implementation

/// Rewrites letters in [line] that look like misread digits.
///
/// The line is cut into maximal runs of alphanumeric characters and maximal
/// runs of everything else. Non-alphanumeric runs are copied through
/// untouched. Each alphanumeric run is handled as follows:
///
/// * If it contains at least one digit and at least as many digits as
///   letters, every letter that sits directly next to a digit is looked up in
///   `digitConfusionMap` and replaced when an entry exists.
/// * Otherwise, if it consists only of letters and is either accepted by
///   `_isGridLikeDigitToken` or no longer than
///   `_maxShortLetterSegmentLength`, the whole run is mapped through
///   `digitConfusionMap` when both nearest alphanumeric neighbours are
///   digit-dominant, or when one of them is and every character of the run
///   is in `highConfidenceDigitLookalikes`.
/// * Any other run is left as it is.
///
/// Returns the line rebuilt from the (possibly rewritten) runs.
String normalizeDigitSegments(String line) {
  // Phase 1: cut the line into runs, remembering which ones are alphanumeric.
  final segments = <String>[];
  final segmentIsAlnum = <bool>[];
  var runStart = 0;
  bool? runIsAlnum;

  for (var pos = 0; pos < line.length; pos++) {
    final unit = line.codeUnitAt(pos);
    final unitIsAlnum = isLetter(unit) || isDigit(unit);
    if (runIsAlnum != null && unitIsAlnum != runIsAlnum) {
      // The character class flipped: close the run that ended just before.
      segments.add(line.substring(runStart, pos));
      segmentIsAlnum.add(runIsAlnum);
      runStart = pos;
    }
    runIsAlnum = unitIsAlnum;
  }
  if (runIsAlnum != null) {
    // Non-empty line: flush the trailing run.
    segments.add(line.substring(runStart));
    segmentIsAlnum.add(runIsAlnum);
  }

  // Whether [text] has at least one digit and no more letters than digits.
  bool hasDigitMajority(String text) {
    var digitCount = 0;
    var letterCount = 0;
    for (final unit in text.codeUnits) {
      if (isDigit(unit)) {
        digitCount++;
      } else if (isLetter(unit)) {
        letterCount++;
      }
    }
    return digitCount > 0 && digitCount >= letterCount;
  }

  // Whether [text] is non-empty and made up of letters only.
  bool consistsOfLetters(String text) =>
      text.isNotEmpty && text.codeUnits.every((unit) => isLetter(unit));

  // Whether the closest alphanumeric run reached from [from] by repeatedly
  // moving [step] positions is digit-dominant; false when there is none.
  bool nearestAlnumHasDigitMajority(int from, int step) {
    for (var idx = from + step;
        idx >= 0 && idx < segments.length;
        idx += step) {
      if (segmentIsAlnum[idx]) {
        return hasDigitMajority(segments[idx]);
      }
    }
    return false;
  }

  // Maps only those letters of [text] that have a digit immediately before
  // or after them; every other character is kept.
  String remapLettersTouchingDigits(String text) {
    final units = text.codeUnits;
    final result = StringBuffer();
    for (var k = 0; k < units.length; k++) {
      final ch = text[k];
      final touchesDigit = isLetter(units[k]) &&
          ((k > 0 && isDigit(units[k - 1])) ||
              (k + 1 < units.length && isDigit(units[k + 1])));
      result.write(touchesDigit ? (digitConfusionMap[ch] ?? ch) : ch);
    }
    return result.toString();
  }

  // Maps every character of [text] that has an entry in the confusion map.
  String remapEveryCharacter(String text) =>
      text.split('').map((ch) => digitConfusionMap[ch] ?? ch).join();

  // Phase 2: rewrite the alphanumeric runs and stitch the line back together.
  final rebuilt = StringBuffer();
  for (var index = 0; index < segments.length; index++) {
    final current = segments[index];

    if (!segmentIsAlnum[index]) {
      rebuilt.write(current);
      continue;
    }

    if (hasDigitMajority(current)) {
      rebuilt.write(remapLettersTouchingDigits(current));
      continue;
    }

    final isLetterCandidate = consistsOfLetters(current) &&
        (_isGridLikeDigitToken(
              segments,
              segmentIsAlnum,
              index,
              current,
              hasDigitMajority,
            ) ||
            current.length <= _maxShortLetterSegmentLength);
    if (!isLetterCandidate) {
      rebuilt.write(current);
      continue;
    }

    // All-letter run that may really be digits, e.g. the "Ol" in
    // "2020-Ol-02" sitting between "2020" and "02" → "01".
    final beforeIsNumeric = nearestAlnumHasDigitMajority(index, -1);
    final afterIsNumeric = nearestAlnumHasDigitMajority(index, 1);

    final bool shouldConvert;
    if (beforeIsNumeric && afterIsNumeric) {
      // Surrounded by numbers on both sides: always convert.
      shouldConvert = true;
    } else if (beforeIsNumeric || afterIsNumeric) {
      // Only one numeric neighbour: require every character to be a
      // high-confidence digit lookalike (O, l, I, S, Z).
      shouldConvert = current
          .split('')
          .every((ch) => highConfidenceDigitLookalikes.contains(ch));
    } else {
      shouldConvert = false;
    }

    rebuilt.write(shouldConvert ? remapEveryCharacter(current) : current);
  }

  return rebuilt.toString();
}