normalizeFragmentedLine function
Repairs fragmented words and common letter confusions in noisy lines.
Implementation
/// Repairs fragmented words and common letter confusions in a noisy [line].
///
/// Lines for which `_looksFragmented` returns false are handed back untouched.
/// Otherwise the following passes run, in this order:
///
///  1. A lowercase `l` that directly follows an uppercase letter and is not
///     followed by a lowercase letter is replaced with `OcrTokens.upperI`
///     (intended for acronym contexts such as 'AlB' or 'OpenAl').
///  2. The standalone token `ln` is replaced with `In`.
///  3. An uppercase `I` sandwiched between two lowercase letters is replaced
///     with `OcrTokens.lowerL`.
///  4. Runs of two or more isolated single letters separated by whitespace
///     ('c o d e') are joined into one token ('code').
///  5. When [applyDictionary] is true, the result is finally passed through
///     `_mergeLikelyWordFragments`.
///
/// [applyDictionary] also influences pass 4: if the joined letters are not
/// found in `englishWords`, `_trySplitMergedCharacters` may supply a
/// replacement for the run instead of the plain joined form.
String normalizeFragmentedLine(String line, {bool applyDictionary = true}) {
  if (!_looksFragmented(line)) {
    return line;
  }

  // 'l' after an uppercase letter, not followed by a lowercase letter.
  final RegExp acronymLowerL = RegExp(r'(?<=[A-Z])l(?![a-z])');
  // The two-letter token 'ln' standing on its own.
  final RegExp standaloneLn = RegExp(r'\bln\b');
  // 'I' with a lowercase letter on both sides.
  final RegExp upperIInsideWord = RegExp(r'(?<=[a-z])I(?=[a-z])');
  // Two or more single letters, each separated by whitespace, with no letter
  // touching either end of the run.
  final RegExp spacedLetters =
      RegExp(r'(?<![A-Za-z])([A-Za-z])(?:\s+([A-Za-z]))+(?![A-Za-z])');
  final RegExp whitespaceRun = RegExp(r'\s+');

  // Collapses one run of spaced-out letters into its replacement text.
  String rejoinLetters(Match run) {
    final String joined = run.group(0)!.replaceAll(whitespaceRun, '');
    // Without the dictionary, or when the joined letters are a known word,
    // the joined form is used as-is.
    if (!applyDictionary || englishWords.contains(joined.toLowerCase())) {
      return joined;
    }
    // Unknown word: let the splitter propose a replacement; fall back to the
    // joined form when it has none.
    // NOTE(review): presumably this keeps e.g. 'w i d g e t A' as 'widget A'
    // rather than 'widgetA' - confirm against _trySplitMergedCharacters.
    return _trySplitMergedCharacters(joined) ?? joined;
  }

  // The passes are order-dependent: each one sees the previous one's output.
  final String repaired = line
      .replaceAllMapped(acronymLowerL, (_) => OcrTokens.upperI)
      .replaceAll(standaloneLn, 'In')
      .replaceAllMapped(upperIInsideWord, (_) => OcrTokens.lowerL)
      .replaceAllMapped(spacedLetters, rejoinLetters);

  return applyDictionary ? _mergeLikelyWordFragments(repaired) : repaired;
}