normalizeCasingOfSentence function

String normalizeCasingOfSentence(String sentence)

Processes a sentence and applies appropriate casing rules.

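This function takes a sentence string and normalizes its casing. Sentences that contain code-like tokens, are mostly uppercase, or already use clean title-style capitalization keep their casing (in the title-style case, very short lowercase words may be capitalized). Other sentences are lowercased and, in the common case, have their first letter capitalized, with recognized acronyms and standalone single uppercase letters such as "A" and "I" restored from the original text. Returns the processed sentence with normalized casing.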

Implementation

/// Normalizes the casing of a single [sentence].
///
/// The decision is made in stages; the first stage that applies wins:
///
/// 1. An empty sentence is returned unchanged.
/// 2. A sentence for which [hasCodeLikeToken] or
///    [_hasStructuredShortUppercaseFieldValue] reports a match is returned
///    unchanged.
/// 3. A sentence with more than one uppercase letter in which uppercase
///    letters outnumber lowercase letters is returned unchanged.
/// 4. A sentence whose capitalized words look like clean title case (no word
///    with noisy internal casing) is either returned unchanged or passed
///    through [_capitalizeVeryShortLowercaseWords], depending on how many
///    title-case and acronym tokens it contains.
/// 5. A sentence accepted by [shouldPreserveLongLowercaseProse] is returned
///    fully lowercased (leading whitespace kept).
/// 6. Otherwise the sentence is lowercased, its first character is
///    capitalized when it is a letter, and tokens accepted by
///    [_isRestorableSentenceAcronym] as well as standalone single uppercase
///    letters (e.g. "A", "I") are restored from the original text.
///
/// Leading whitespace of [sentence] is always preserved.
String normalizeCasingOfSentence(final String sentence) {
  if (sentence.isEmpty) {
    return sentence;
  }

  // Preserve codes and IDs, but keep normal prose lines with dates/numbers
  // eligible for sentence-level case cleanup.
  if (hasCodeLikeToken(sentence) ||
      _hasStructuredShortUppercaseFieldValue(sentence)) {
    return sentence;
  }

  // Count letters to determine dominant case.
  int upper = 0;
  int lower = 0;
  for (int i = 0; i < sentence.length; i++) {
    final String char = sentence[i];
    if (isUppercaseLetter(char)) {
      upper++;
    } else if (isLowercaseLetter(char)) {
      lower++;
    }
  }

  // If the sentence is mostly uppercase, preserve it (e.g., "HELLO WORLD").
  if (upper > lower && upper > 1) {
    return sentence;
  }

  // If multiple words start with an uppercase letter and each capitalized word
  // has clean casing (first letter upper, remaining letters lower or
  // non-letter), the sentence likely uses title case, proper nouns, or
  // acronyms. Preserve the original casing instead of blanket-lowercasing.
  // Words with noisy internal uppercase like "CaSe" disqualify the sentence.
  const int noisyCasingTransitionThreshold = 2;
  // Hoisted out of the word loop so the pattern is built once per call.
  final RegExp nonAsciiLetter = RegExp(r'[^A-Za-z]');
  final List<String> words = sentence.trim().split(RegExp(r'\s+'));
  int titleCaseTokenCount = 0;
  int acronymTokenCount = 0;
  int alphaWordCount = 0;
  bool hasNoisyCasing = false;
  for (final String word in words) {
    // Only words that start with a letter take part in the statistics.
    if (word.isEmpty || !isLetter(word[0])) continue;
    alphaWordCount++;
    if (isUppercaseLetter(word[0])) {
      // Skip words containing digits — these are likely codes or dates
      // with OCR-confused leading letters (e.g., "O3/15/2025"), not
      // genuine title-case words.
      bool hasDigit = false;
      for (int ci = 0; ci < word.length; ci++) {
        if (isDigit(word[ci])) {
          hasDigit = true;
          break;
        }
      }
      if (hasDigit) continue;

      // A word whose ASCII letters are all uppercase is an acronym
      // candidate, never a title-case token.
      final String alphaOnly = word.replaceAll(nonAsciiLetter, '');
      if (alphaOnly.isNotEmpty && alphaOnly == alphaOnly.toUpperCase()) {
        if (_isRestorableSentenceAcronym(word)) {
          acronymTokenCount++;
        }
        continue;
      }

      titleCaseTokenCount++;
      // Detect noisy internal casing by counting case transitions in the tail.
      // "CaSe" tail: L,U,L → 2 transitions → noisy OCR artifact.
      // "OpenAI" tail: L,L,L,U,U → 1 transition → valid camelCase/brand.
      // "Released" tail: all lower → 0 transitions → clean title case.
      int transitions = 0;
      bool? lastWasUpper;
      for (int ci = 1; ci < word.length; ci++) {
        if (isUppercaseLetter(word[ci])) {
          if (lastWasUpper == false) transitions++;
          lastWasUpper = true;
        } else if (isLowercaseLetter(word[ci])) {
          if (lastWasUpper == true) transitions++;
          lastWasUpper = false;
        }
      }
      if (transitions >= noisyCasingTransitionThreshold) {
        hasNoisyCasing = true;
      }
    }
  }

  // Only treat as title-case when the majority of alphabetic words start
  // uppercase.  A stray OCR capitalization (e.g., "With" for "with") should
  // not trigger short-word capitalization across the whole sentence.
  final bool isTitleCase =
      titleCaseTokenCount > 1 &&
      !hasNoisyCasing &&
      alphaWordCount > 0 &&
      titleCaseTokenCount > alphaWordCount ~/ _titleCaseMajorityDivisor;
  if (isTitleCase) {
    // In title-case sentences, capitalize very short (1-2 char) lowercase
    // words to match the dominant pattern.  These often arise from OCR
    // confusion between 'l' and 'I' producing "in" instead of "In".
    return _capitalizeVeryShortLowercaseWords(sentence);
  }

  // Preserve sentences with multiple uppercase-starting words even when
  // the strict title-case threshold is not met (e.g., one stray OCR
  // capitalization among many lowercase words).
  if (!hasNoisyCasing &&
      (titleCaseTokenCount > 1 ||
          (titleCaseTokenCount == 1 &&
              acronymTokenCount == 1 &&
              alphaWordCount <= _shortAcronymPhraseMaxWords))) {
    if (titleCaseTokenCount > 1 && acronymTokenCount > 0) {
      return _capitalizeVeryShortLowercaseWords(sentence);
    }
    return sentence;
  }

  final String trimmed = sentence.trimLeft();
  if (trimmed.isEmpty) {
    return sentence;
  }

  // Number of leading whitespace code units to re-attach to the result.
  final int offset = sentence.length - trimmed.length;
  if (shouldPreserveLongLowercaseProse(
    trimmed,
    minTokens: _longLowercaseSentenceMinTokens,
    minLetters: _longLowercaseSentenceMinLetters,
  )) {
    return sentence.substring(0, offset) + trimmed.toLowerCase();
  }

  // Baseline sentence case: everything lowercase, first letter uppercase.
  String content = trimmed.toLowerCase();
  final String firstChar = content[0];

  if (isLetter(firstChar)) {
    content = firstChar.toUpperCase() + content.substring(1);
  }

  // The restoration passes below take indices from `trimmed` and apply them
  // to `content`.  That is only valid while both strings have the same
  // length; case conversion is not guaranteed to preserve length for every
  // character, so skip the restoration rather than splice at misaligned (or
  // out-of-range) positions.
  if (content.length != trimmed.length) {
    return sentence.substring(0, offset) + content;
  }

  // Restore acronyms: copy letter/period tokens from the original text when
  // they qualify, otherwise keep the sentence-cased text for that span.
  final StringBuffer restoredAcronyms = StringBuffer();
  int restoreIndex = 0;
  for (final Match match in RegExp(r'[A-Za-z][A-Za-z.]*').allMatches(trimmed)) {
    restoredAcronyms.write(content.substring(restoreIndex, match.start));
    final String originalToken = match.group(0)!;
    if (_isRestorableSentenceAcronym(originalToken)) {
      restoredAcronyms.write(originalToken);
    } else {
      restoredAcronyms.write(content.substring(match.start, match.end));
    }
    restoreIndex = match.end;
  }
  restoredAcronyms.write(content.substring(restoreIndex));
  content = restoredAcronyms.toString();

  // Restore standalone single uppercase letters from the original text.
  // Words like "A" (article) and "I" (pronoun) should preserve their
  // uppercase form even in predominantly lowercase sentences.  Built in one
  // pass so the string is not re-copied for every restored letter.
  final StringBuffer restoredLetters = StringBuffer();
  for (int i = 0; i < trimmed.length; i++) {
    final bool isStandaloneUppercase =
        isUppercaseLetter(trimmed[i]) &&
        (i == 0 || !isLetter(trimmed[i - 1])) &&
        (i == trimmed.length - 1 || !isLetter(trimmed[i + 1]));
    restoredLetters.write(isStandaloneUppercase ? trimmed[i] : content[i]);
  }
  content = restoredLetters.toString();

  return sentence.substring(0, offset) + content;
}