applyDictionaryCorrectionOnSingleSentence function

String applyDictionaryCorrectionOnSingleSentence(
  String inputSentence,
  Map<String, List<String>> correctionLetters
)

Applies dictionary-based correction to inputSentence. It first tries to match each word directly in the dictionary, then attempts to substitute commonly confused characters using correctionLetters, and finally falls back to the closest match in the dictionary if no match has been found. The original casing of the input words is preserved in the corrected output.

Implementation

/// Applies dictionary-based correction to a single sentence.
///
/// [inputSentence] is split into word tokens and separator tokens
/// (whitespace runs and the single punctuation marks `. , ! ? ; :`).
/// Separator tokens are always passed through untouched.
///
/// A word token is considered for correction only when it is at least
/// `_minDictionaryTokenLength` long, is not mostly digits or punctuation
/// (per `CharacterStats`), and its lowercase form is not already in
/// `englishWords`. Such a token then goes through up to three stages:
///
/// 1. each single substitution from [correctionLetters] (key → one of its
///    listed replacements) is tried; the first result found in
///    `englishWords` wins;
/// 2. otherwise, pairs of the substitutions that changed the word in
///    stage 1 are chained and checked against `englishWords`;
/// 3. otherwise, for tokens of at least `_minFallbackCorrectionLength`,
///    the result of `findClosestMatchingWordInDictionary` is accepted only
///    if it has the same length, is within
///    `_maxDictionaryFallbackDistance` edits, and every differing character
///    is accepted by `isOcrConfusionPair`.
///
/// Returns the re-joined tokens after `normalizeCasingOfParagraph`.
String applyDictionaryCorrectionOnSingleSentence(
  final String inputSentence,
  final Map<String, List<String>> correctionLetters,
) {
  final regex = RegExp(
    r'(\s+|[.,!?;:])',
  ); // Matches whitespace runs or single punctuation marks

  // Tokenize by walking the separator matches directly. No sentinel
  // character is injected, so every character of the input (including a
  // literal '¤') survives the round trip through `join('')`.
  final List<String> words = [];
  // Parallel to `words`: true when the token was produced by `regex`.
  final List<bool> isSeparator = [];
  int cursor = 0;
  for (final Match separator in regex.allMatches(inputSentence)) {
    if (separator.start > cursor) {
      words.add(inputSentence.substring(cursor, separator.start));
      isSeparator.add(false);
    }
    words.add(inputSentence.substring(separator.start, separator.end));
    isSeparator.add(true);
    cursor = separator.end;
  }
  if (cursor < inputSentence.length) {
    words.add(inputSentence.substring(cursor));
    isSeparator.add(false);
  }

  for (int i = 0; i < words.length; i++) {
    // Whitespace runs and punctuation are never correction candidates,
    // whatever their length.
    if (isSeparator[i]) continue;

    final String word = words[i];
    if (word.length < _minDictionaryTokenLength) continue;

    // No need to process numbers or symbol-heavy tokens.
    final stats = CharacterStats(word);
    if (stats.mostlyDigits() || stats.mostlyPunctuation()) continue;

    // Direct dictionary hit: nothing to correct.
    if (englishWords.contains(word.toLowerCase())) continue;

    if (_shouldProtectUppercaseDictionaryCorrectionInCommaLine(
      inputSentence,
      word,
    )) {
      continue;
    }

    String modifiedWord = word;
    bool foundMatch = false;

    // Pass 1: try each single substitution. Substitutions that change the
    // word but do not yield a dictionary word are remembered for pass 2
    // (e.g. words needing o→e + l→i).
    final List<MapEntry<String, String>> singleSubs = [];
    for (final MapEntry<String, List<String>> entry
        in correctionLetters.entries) {
      if (!word.contains(entry.key)) continue;

      for (final String substitute in entry.value) {
        final String testWord = _caseAwareReplace(
          word,
          entry.key,
          substitute,
        );
        if (testWord == word) continue;

        if (englishWords.contains(testWord.toLowerCase())) {
          modifiedWord = testWord;
          foundMatch = true;
          break;
        }
        singleSubs.add(MapEntry(entry.key, substitute));
      }
      if (foundMatch) break;
    }

    // Pass 2: try chaining pairs of the recorded substitutions.
    // A successful pair is stored in lowercase; final casing is left to
    // normalizeCasingOfParagraph, since casing derived from noisy OCR is
    // unreliable when multiple characters are wrong.
    if (!foundMatch && singleSubs.length >= _minSubstitutionPairsRequired) {
      for (int a = 0; a < singleSubs.length && !foundMatch; a++) {
        final String after1 = _caseAwareReplace(
          word,
          singleSubs[a].key,
          singleSubs[a].value,
        );
        for (int b = a + 1; b < singleSubs.length; b++) {
          // The first substitution may have consumed the second one's key.
          if (!after1.contains(singleSubs[b].key)) continue;
          final String after2 = _caseAwareReplace(
            after1,
            singleSubs[b].key,
            singleSubs[b].value,
          );
          if (englishWords.contains(after2.toLowerCase())) {
            modifiedWord = after2.toLowerCase();
            foundMatch = true;
            break;
          }
        }
      }
    }

    // Pass 3: conservative same-length near match, to avoid over-correcting
    // tokens. Restricted to words long enough that a small edit is
    // proportionally minor.
    if (!foundMatch && word.length >= _minFallbackCorrectionLength) {
      final String suggestion = findClosestMatchingWordInDictionary(word);
      // The length test runs first so the edit distance is only computed
      // for candidates that can still be accepted.
      if (suggestion.length == word.length &&
          levenshteinDistance(
                word.toLowerCase(),
                suggestion.toLowerCase(),
              ) <=
              _maxDictionaryFallbackDistance) {
        // Only accept when every changed character is a plausible OCR
        // confusion (e.g., l↔I, 0↔O). This prevents morphological form
        // changes like "Released" → "Releases", where d→s is not an OCR
        // confusion.
        bool acceptFallback = true;
        for (int ci = 0; ci < word.length; ci++) {
          if (word[ci].toLowerCase() != suggestion[ci].toLowerCase() &&
              !isOcrConfusionPair(word[ci], suggestion[ci])) {
            acceptFallback = false;
            break;
          }
        }
        if (acceptFallback) {
          // NOTE(review): the suggestion is stored with whatever casing the
          // dictionary lookup returns — confirm normalizeCasingOfParagraph
          // restores the intended casing.
          modifiedWord = suggestion;
        }
      }
    }

    words[i] = modifiedWord;
  }

  return normalizeCasingOfParagraph(words.join(''));
}