correctNearMissDictionaryWords function
Corrects near-miss dictionary words with strict edit-distance limits.
Implementation
/// Fixes tokens in [line] that are a near miss of a dictionary word.
///
/// Every run of ASCII letters is examined independently. A token is returned
/// untouched when it is shorter than [_nearMissMinTokenLength], is mixed case,
/// is an acronym (unless the line looks like uppercase prose and the token is
/// fully uppercase), or already appears in [englishWords] once lowercased.
///
/// A same-length suggestion from [findClosestMatchingWordInDictionary] is
/// accepted only if every differing position is an OCR confusion pair and
/// either exactly one position differs at edit distance 1, or — for uppercase
/// tokens of at least [_uppercaseNearMissMinTokenLength] letters on an
/// uppercase prose line — exactly [_uppercaseNearMissDistance] positions
/// differ at that same edit distance. Uppercase tokens that still have no
/// accepted suggestion fall back to
/// [_findClosestUppercaseLengthFlexibleSuggestion].
///
/// The replacement is re-cased to follow the original token (title case,
/// lowercase or uppercase). An empty [line] is returned as is.
String correctNearMissDictionaryWords(String line) {
  if (line.isEmpty) {
    return line;
  }

  final bool uppercaseProse = _looksLikeUppercaseProseLine(line);
  final RegExp letterRun = RegExp(r'[A-Za-z]+');

  return line.replaceAllMapped(letterRun, (Match match) {
    final String word = match.group(0)!;

    // Guard clauses: tokens that must never be rewritten.
    if (word.length < _nearMissMinTokenLength) {
      return word;
    }
    // Mixed-case words (e.g., 'OpenAI') are kept verbatim.
    if (isMixedCase(word)) {
      return word;
    }
    final bool isAllCaps = word == word.toUpperCase();
    // Uppercase tokens are only open to correction on uppercase prose lines.
    final bool uppercaseEligible = uppercaseProse && isAllCaps;
    // Acronyms (e.g., 'GPT') are kept unless the uppercase rule applies.
    if (isAcronym(word) && !uppercaseEligible) {
      return word;
    }
    final String lower = word.toLowerCase();
    if (englishWords.contains(lower)) {
      return word;
    }

    final bool longEnoughForUppercaseRules =
        word.length >= _uppercaseNearMissMinTokenLength;

    String candidate = findClosestMatchingWordInDictionary(word);
    bool accepted = false;

    if (candidate.isNotEmpty && candidate.length == word.length) {
      final int distance = levenshteinDistance(lower, candidate.toLowerCase());

      // Walk both words in lockstep; stop at the first difference that is
      // not an OCR confusion pair.
      int mismatches = 0;
      bool onlyConfusions = true;
      int index = 0;
      while (onlyConfusions && index < word.length) {
        final String actual = word[index];
        final String expected = candidate[index];
        if (actual.toLowerCase() != expected.toLowerCase()) {
          mismatches++;
          onlyConfusions = isOcrConfusionPair(actual, expected);
        }
        index++;
      }

      if (onlyConfusions) {
        // Here every counted mismatch is a confusion pair, so one counter
        // covers both the difference count and the confusion count.
        final bool singleSwap = distance == 1 && mismatches == 1;
        final bool uppercaseMultiSwap = uppercaseEligible &&
            longEnoughForUppercaseRules &&
            distance == _uppercaseNearMissDistance &&
            mismatches == _uppercaseNearMissDistance;
        accepted = singleSwap || uppercaseMultiSwap;
      }
    }

    // Fallback for uppercase prose when the same-length check did not accept.
    if (!accepted && uppercaseEligible && longEnoughForUppercaseRules) {
      final String? flexible =
          _findClosestUppercaseLengthFlexibleSuggestion(word);
      if (flexible != null) {
        candidate = flexible;
        accepted = true;
      }
    }

    if (!accepted) {
      return word;
    }

    // Re-case the replacement so it follows the original token.
    if (isTitleCaseWord(word)) {
      return toTitleCaseWord(candidate);
    }
    if (word == lower) {
      return candidate.toLowerCase();
    }
    return isAllCaps ? candidate.toUpperCase() : candidate;
  });
}