documentTokenizer function

TokenizationOutput documentTokenizer(
  List<String> documentList, {
  dynamic minLen = 1,
  String Function(String)? stemmer,
  List<String>? stopwords,
})

Simple document tokenization.

Input is lowercased first; only the characters [a-z], [0-9] and spaces are kept (everything else is removed).

May use external stemmer if available.

Implementation

/// Simple document tokenization over [documentList].
///
/// Each document is lowercased, stripped of every character outside
/// `[a-z0-9 ]`, and split into words on runs of whitespace. Documents
/// with fewer than [minLen] words are skipped (their per-document stats
/// are recorded as empty). The optional [stemmer] is applied to each
/// word before the optional [stopwords] filter.
///
/// Returns a [TokenizationOutput] holding the corpus-wide bag of words,
/// the per-document bags of words, per-document word totals, the number
/// of distinct words, and per-word document frequencies.
TokenizationOutput documentTokenizer(List<String> documentList,
    {minLen = 1, String Function(String)? stemmer, List<String>? stopwords}) {
  TokenizationOutput tokenOut = TokenizationOutput();

  for (int k = 0; k < documentList.length; k++) {
    Map<String, double> currentBOW = {};
    int currentTotalWord = 0;

    // Normalize and split on whitespace runs, dropping empty tokens.
    // Splitting on a single space produced empty strings for consecutive
    // or leading/trailing spaces, which inflated words.length and let
    // too-short documents slip past the minLen filter below.
    List<String> words = documentList[k]
        .toLowerCase()
        .replaceAll(RegExp(r"[^a-z0-9 ]"), "")
        .split(RegExp(r"\s+"))
      ..removeWhere((w) => w.isEmpty);

    if (words.length >= minLen) {
      // Skip documents with fewer than minLen real words.
      int contentWordCount = 0;
      for (final token in words) {
        // Use the external stemmer when one was supplied.
        final String word = stemmer != null ? stemmer(token) : token;
        // Discard blanks (a stemmer may return them) and stopwords.
        if (word.trim().isEmpty) continue;
        if (stopwords != null && stopwords.contains(word)) continue;

        contentWordCount++;

        // Corpus-wide bag of words; track distinct-word count on first sight.
        if (tokenOut.bagOfWords.containsKey(word)) {
          tokenOut.bagOfWords[word] = tokenOut.bagOfWords[word]! + 1;
        } else {
          tokenOut.bagOfWords[word] = 1;
          tokenOut.numberOfDistintWords++;
        }

        // Per-document bag of words.
        if (currentBOW.containsKey(word)) {
          currentBOW[word] = currentBOW[word]! + 1;
        } else {
          currentBOW[word] = 1;
          // First occurrence in this document: bump document frequency.
          if (tokenOut.wordInDocumentOccurrence.containsKey(word)) {
            tokenOut.wordInDocumentOccurrence[word] =
                tokenOut.wordInDocumentOccurrence[word]! + 1;
          } else {
            tokenOut.wordInDocumentOccurrence[word] = 1;
          }
        }
      }
      tokenOut.totalNumberOfWords += contentWordCount;
      currentTotalWord = contentWordCount;
    }
    // Per-document stats are appended even for skipped documents so the
    // output lists stay aligned with documentList indices.
    tokenOut.documentBOW.add(currentBOW);
    tokenOut.documentTotalWord.add(currentTotalWord);
  }

  return tokenOut;
}