normalizePunctuationHeavyText function
Removes short lines that are overwhelmingly punctuation, leaving all other lines unchanged.
Implementation
/// Removes short lines of [text] that consist mostly of punctuation.
///
/// [text] is split on `'\n'` and every line is judged independently:
///
/// * Empty lines are always kept.
/// * Lines longer than [_punctuationFilterShortLineMaxLength] UTF-16 code
///   units are always kept.
/// * Lines made up solely of space characters are kept.
/// * Lines with at least one non-space character but no letter or digit are
///   dropped.
/// * Lines whose share of punctuation among the counted (non-space)
///   characters exceeds [_punctuationHeavyRatioThreshold] are dropped.
///
/// A code unit counts as punctuation when it is neither a letter nor a digit
/// (as decided by [isLetter] and [isDigit]) and is not the space character
/// `' '`. Other whitespace, such as tabs or a trailing `'\r'`, is therefore
/// counted as punctuation.
///
/// Returns the surviving lines, in their original order, joined with `'\n'`.
String normalizePunctuationHeavyText(String text) {
  final List<String> kept = <String>[];
  for (final String line in text.split('\n')) {
    // Empty and long lines are never filtered, so decide on them before
    // paying for the per-character scan below.
    if (line.isEmpty || line.length > _punctuationFilterShortLineMaxLength) {
      kept.add(line);
      continue;
    }

    int punctuation = 0;
    int alphanumeric = 0;
    for (final int code in line.codeUnits) {
      if (isLetter(code) || isDigit(code)) {
        alphanumeric++;
      } else if (code != 0x20) {
        // Anything that is not a letter, a digit, or ' ' (0x20) is treated
        // as punctuation.
        punctuation++;
      }
    }

    final int counted = punctuation + alphanumeric;
    if (counted == 0) {
      // Space-only line: there is nothing to measure. Keep it explicitly
      // rather than relying on 0 / 0 (NaN) comparing false against the
      // threshold.
      kept.add(line);
      continue;
    }
    if (alphanumeric == 0) {
      // Only punctuation on this line; drop it regardless of the threshold.
      continue;
    }
    if (punctuation / counted > _punctuationHeavyRatioThreshold) {
      continue;
    }
    kept.add(line);
  }
  return kept.join('\n');
}