normalizeCasingOfSentence function
Processes a sentence and applies appropriate casing rules.
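This function takes a sentence string and returns it with normalized casing. Sentences whose casing already looks intentional (code-like tokens, mostly-uppercase text, or several cleanly capitalized words) are returned unchanged or with only very short lowercase words capitalized. Otherwise the sentence is lowercased, its first letter is capitalized unless it is treated as long lowercase prose, and acronyms and standalone single uppercase letters are restored from the original text.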
Implementation
/// Normalizes the letter casing of [sentence] while leaving casing that looks
/// intentional untouched.
///
/// The checks below run in order and the first one that matches decides the
/// result:
///
/// 1. An empty [sentence] is returned unchanged.
/// 2. A sentence for which `hasCodeLikeToken` or
///    `_hasStructuredShortUppercaseFieldValue` is true is returned unchanged.
/// 3. A sentence with more uppercase than lowercase letters (and at least two
///    uppercase letters) is returned unchanged.
/// 4. A sentence classified as title case is passed through
///    `_capitalizeVeryShortLowercaseWords`.
/// 5. A sentence without noisy casing that has several capitalized words, or
///    that is a short phrase with exactly one capitalized word and one
///    restorable acronym, is returned unchanged (or passed through
///    `_capitalizeVeryShortLowercaseWords` when it has several capitalized
///    words and at least one acronym).
/// 6. A whitespace-only sentence is returned unchanged.
/// 7. When `shouldPreserveLongLowercaseProse` accepts the left-trimmed text,
///    the text is lowercased and the leading whitespace is kept.
/// 8. Otherwise the text is lowercased, its first character is uppercased if
///    it is a letter, tokens accepted by `_isRestorableSentenceAcronym` are
///    restored from the original, and standalone single uppercase letters
///    (such as "A" or "I") are restored from the original. Leading whitespace
///    is kept.
String normalizeCasingOfSentence(final String sentence) {
  if (sentence.isEmpty) {
    return sentence;
  }

  // Preserve codes and IDs, but keep normal prose lines with dates/numbers
  // eligible for sentence-level case cleanup.
  if (hasCodeLikeToken(sentence) ||
      _hasStructuredShortUppercaseFieldValue(sentence)) {
    return sentence;
  }

  // Count letters to determine the dominant case.
  int upper = 0;
  int lower = 0;
  for (int i = 0; i < sentence.length; i++) {
    final String char = sentence[i];
    if (isUppercaseLetter(char)) {
      upper++;
    } else if (isLowercaseLetter(char)) {
      lower++;
    }
  }

  // If the sentence is mostly uppercase, preserve it (e.g., "HELLO WORLD").
  if (upper > lower && upper > 1) {
    return sentence;
  }

  // If multiple words start with an uppercase letter and each capitalized
  // word has clean casing (first letter upper, remaining letters lower or
  // non-letter), the sentence likely uses title case, proper nouns, or
  // acronyms. Preserve the original casing instead of blanket-lowercasing.
  // Words with noisy internal uppercase like "CaSe" disqualify the sentence.
  const int noisyCasingTransitionThreshold = 2;

  // Patterns are compiled once per call instead of once per word.
  final RegExp whitespaceRun = RegExp(r'\s+');
  final RegExp nonAsciiLetter = RegExp(r'[^A-Za-z]');
  final RegExp letterToken = RegExp(r'[A-Za-z][A-Za-z.]*');

  int titleCaseTokenCount = 0;
  int acronymTokenCount = 0;
  int alphaWordCount = 0;
  bool hasNoisyCasing = false;

  for (final String word in sentence.trim().split(whitespaceRun)) {
    if (word.isEmpty || !isLetter(word[0])) continue;
    alphaWordCount++;

    // Only uppercase-leading words can be title-case or acronym tokens.
    if (!isUppercaseLetter(word[0])) continue;

    // Skip words containing digits — these are likely codes or dates with
    // OCR-confused leading letters (e.g., "O3/15/2025"), not genuine
    // title-case words. They still count towards [alphaWordCount].
    bool hasDigit = false;
    for (int ci = 0; ci < word.length; ci++) {
      if (isDigit(word[ci])) {
        hasDigit = true;
        break;
      }
    }
    if (hasDigit) continue;

    // All-uppercase words are never title-case tokens; they count as
    // acronyms only when the helper accepts them.
    final String alphaOnly = word.replaceAll(nonAsciiLetter, '');
    if (alphaOnly.isNotEmpty && alphaOnly == alphaOnly.toUpperCase()) {
      if (_isRestorableSentenceAcronym(word)) {
        acronymTokenCount++;
      }
      continue;
    }

    titleCaseTokenCount++;

    // Detect noisy internal casing by counting case transitions in the tail.
    // "CaSe" tail: L,U,L → 2 transitions → noisy OCR artifact.
    // "OpenAI" tail: L,L,L,U,U → 1 transition → valid camelCase/brand.
    // "Released" tail: all lower → 0 transitions → clean title case.
    int transitions = 0;
    bool? lastWasUpper;
    for (int ci = 1; ci < word.length; ci++) {
      if (isUppercaseLetter(word[ci])) {
        if (lastWasUpper == false) transitions++;
        lastWasUpper = true;
      } else if (isLowercaseLetter(word[ci])) {
        if (lastWasUpper == true) transitions++;
        lastWasUpper = false;
      }
    }
    if (transitions >= noisyCasingTransitionThreshold) {
      hasNoisyCasing = true;
    }
  }

  final bool hasMultipleTitleTokens = titleCaseTokenCount > 1;

  // Only treat as title-case when the majority of alphabetic words start
  // uppercase. A stray OCR capitalization (e.g., "With" for "with") should
  // not trigger short-word capitalization across the whole sentence.
  final bool isTitleCase = hasMultipleTitleTokens &&
      !hasNoisyCasing &&
      alphaWordCount > 0 &&
      titleCaseTokenCount > alphaWordCount ~/ _titleCaseMajorityDivisor;
  if (isTitleCase) {
    // In title-case sentences, capitalize very short (1-2 char) lowercase
    // words to match the dominant pattern. These often arise from OCR
    // confusion between 'l' and 'I' producing "in" instead of "In".
    return _capitalizeVeryShortLowercaseWords(sentence);
  }

  // Preserve sentences with multiple uppercase-starting words even when the
  // strict title-case threshold is not met, as well as short phrases made of
  // exactly one capitalized word plus one restorable acronym.
  if (!hasNoisyCasing &&
      (hasMultipleTitleTokens ||
          (titleCaseTokenCount == 1 &&
              acronymTokenCount == 1 &&
              alphaWordCount <= _shortAcronymPhraseMaxWords))) {
    if (hasMultipleTitleTokens && acronymTokenCount > 0) {
      return _capitalizeVeryShortLowercaseWords(sentence);
    }
    return sentence;
  }

  final String trimmed = sentence.trimLeft();
  if (trimmed.isEmpty) {
    // Whitespace-only input: nothing to normalize.
    return sentence;
  }
  final int offset = sentence.length - trimmed.length;
  final String leadingWhitespace = sentence.substring(0, offset);

  if (shouldPreserveLongLowercaseProse(
    trimmed,
    minTokens: _longLowercaseSentenceMinTokens,
    minLetters: _longLowercaseSentenceMinLetters,
  )) {
    return '$leadingWhitespace${trimmed.toLowerCase()}';
  }

  // Sentence case: lowercase everything, then capitalize the first letter.
  String content = trimmed.toLowerCase();
  final String firstChar = content[0];
  if (isLetter(firstChar)) {
    content = firstChar.toUpperCase() + content.substring(1);
  }

  // NOTE(review): the two restoration passes below take offsets from
  // `trimmed` and use them to index `content`. That assumes lowercasing did
  // not change the string length — confirm for non-ASCII input.

  // Restore acronyms: copy each letter token from the original text when the
  // helper accepts it, otherwise keep the sentence-cased slice.
  final StringBuffer restoredAcronyms = StringBuffer();
  int restoreIndex = 0;
  for (final Match match in letterToken.allMatches(trimmed)) {
    restoredAcronyms.write(content.substring(restoreIndex, match.start));
    final String originalToken = match.group(0)!;
    if (_isRestorableSentenceAcronym(originalToken)) {
      restoredAcronyms.write(originalToken);
    } else {
      restoredAcronyms.write(content.substring(match.start, match.end));
    }
    restoreIndex = match.end;
  }
  restoredAcronyms.write(content.substring(restoreIndex));
  content = restoredAcronyms.toString();

  // Restore standalone single uppercase letters from the original text.
  // Words like "A" (article) and "I" (pronoun) should preserve their
  // uppercase form even in predominantly lowercase sentences. The result is
  // assembled in one pass instead of rebuilding the whole string for every
  // restored letter.
  final int overlap =
      content.length < trimmed.length ? content.length : trimmed.length;
  final StringBuffer restoredLetters = StringBuffer();
  int copiedUpTo = 0;
  for (int i = 0; i < overlap; i++) {
    if (!isUppercaseLetter(trimmed[i])) continue;
    final bool atStart = i == 0 || !isLetter(trimmed[i - 1]);
    final bool atEnd = i == trimmed.length - 1 || !isLetter(trimmed[i + 1]);
    if (atStart && atEnd) {
      restoredLetters
        ..write(content.substring(copiedUpTo, i))
        ..write(trimmed[i]);
      copiedUpTo = i + 1;
    }
  }
  restoredLetters.write(content.substring(copiedUpTo));

  return '$leadingWhitespace$restoredLetters';
}