correctNearMissDictionaryWords function

String correctNearMissDictionaryWords(String line)

Corrects near-miss dictionary words with strict edit-distance limits.

Implementation

/// Replaces near-miss words in [line] with their closest dictionary match.
///
/// Every run of ASCII letters is examined on its own. A word is returned
/// untouched when it is shorter than `_nearMissMinTokenLength`, is reported
/// as mixed case by `isMixedCase`, is reported as an acronym by `isAcronym`
/// (unless the line looks like uppercase prose and the word is all caps), or
/// its lowercase form is already in `englishWords`.
///
/// A same-length suggestion is accepted only if every differing character is
/// an OCR confusion pair and the edit distance, the number of differing
/// positions and the number of confusion pairs are all exactly 1. For
/// all-caps words on uppercase-prose lines that are at least
/// `_uppercaseNearMissMinTokenLength` long, those three counts may instead
/// all equal `_uppercaseNearMissDistance`; failing that, a length-flexible
/// suggestion is tried for such words.
///
/// The replacement takes on the casing of the word it replaces (title case,
/// lowercase or uppercase). An empty [line] is returned as is.
String correctNearMissDictionaryWords(String line) {
  if (line.isEmpty) {
    return line;
  }

  final bool uppercaseProse = _looksLikeUppercaseProseLine(line);
  final wordPattern = RegExp(r'[A-Za-z]+');

  return line.replaceAllMapped(wordPattern, (Match match) {
    final word = match.group(0)!;

    // Short words and mixed-case words (e.g., 'OpenAI') are never touched.
    if (word.length < _nearMissMinTokenLength || isMixedCase(word)) {
      return word;
    }

    final isAllCaps = word == word.toUpperCase();
    final uppercaseEligible = uppercaseProse && isAllCaps;

    // Acronyms (e.g., 'GPT') stay as they are, except for all-caps words on
    // a line that looks like uppercase prose.
    if (isAcronym(word) && !uppercaseEligible) {
      return word;
    }

    // Words already in the dictionary need no correction.
    final lowered = word.toLowerCase();
    if (englishWords.contains(lowered)) {
      return word;
    }

    String replacement = findClosestMatchingWordInDictionary(word);
    var accepted = false;

    if (replacement.isNotEmpty && replacement.length == word.length) {
      final int editDistance =
          levenshteinDistance(lowered, replacement.toLowerCase());

      // Walk both words in step; stop at the first differing position that
      // is not an OCR confusion pair.
      var mismatches = 0;
      var confusions = 0;
      var onlyConfusions = true;
      var index = 0;
      while (onlyConfusions && index < word.length) {
        final written = word[index];
        final proposed = replacement[index];
        if (written.toLowerCase() != proposed.toLowerCase()) {
          mismatches++;
          if (isOcrConfusionPair(written, proposed)) {
            confusions++;
          } else {
            onlyConfusions = false;
          }
        }
        index++;
      }

      if (onlyConfusions) {
        // All three tallies must agree on the permitted number of slips.
        final tallies = <int>[editDistance, mismatches, confusions];
        final isSingleSlip = tallies.every((count) => count == 1);
        final isUppercaseMultiSlip = uppercaseEligible &&
            word.length >= _uppercaseNearMissMinTokenLength &&
            tallies.every((count) => count == _uppercaseNearMissDistance);
        accepted = isSingleSlip || isUppercaseMultiSlip;
      }
    }

    // Fallback for long all-caps words on uppercase-prose lines: try a
    // suggestion whose length may differ from the word's.
    if (!accepted &&
        uppercaseEligible &&
        word.length >= _uppercaseNearMissMinTokenLength) {
      final String? flexible =
          _findClosestUppercaseLengthFlexibleSuggestion(word);
      if (flexible != null) {
        replacement = flexible;
        accepted = true;
      }
    }

    if (!accepted) {
      return word;
    }

    // Give the replacement the casing of the word it replaces.
    if (isTitleCaseWord(word)) {
      return toTitleCaseWord(replacement);
    }
    if (word == lowered) {
      return replacement.toLowerCase();
    }
    return isAllCaps ? replacement.toUpperCase() : replacement;
  });
}