splitByTokens method

List<String> splitByTokens(
  String text,
  int chunkSize, {
  int overlap = 0,
})

Splits text into chunks of approximately chunkSize tokens.

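If overlap is greater than zero, chunks overlap with the preceding chunk by approximately that many tokens; the overlap is estimated at four characters per token rather than measured with the tokenizer.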

Implementation

/// Splits [text] into consecutive chunks whose token count, as reported by
/// `_encoder.count`, is approximately [chunkSize].
///
/// Each chunk's end is found by starting at the end of [text] and shrinking
/// the candidate proportionally to how far it exceeds [chunkSize], so chunks
/// may come out shorter than the budget. A chunk always contains at least one
/// code unit, even if that alone exceeds [chunkSize].
///
/// When [overlap] is positive, every chunk after the first begins before the
/// end of the previous chunk. The overlap width is estimated at four
/// characters per token (it is not measured with the encoder) and is capped
/// so that each chunk starts strictly later than the one before it. A zero or
/// negative [overlap] produces back-to-back chunks.
///
/// Returns an empty list when [text] is empty.
///
/// NOTE: offsets are UTF-16 code units, so a cut can fall inside a surrogate
/// pair.
List<String> splitByTokens(String text, int chunkSize, {int overlap = 0}) {
  if (text.isEmpty) return [];
  final chunks = <String>[];
  var start = 0;
  while (start < text.length) {
    // Start with everything that is left and shrink until it fits the budget.
    var end = text.length;
    var tokens = _encoder.count(text.substring(start, end));
    while (tokens > chunkSize && end > start + 1) {
      // Scale the candidate length by chunkSize / tokens. Because
      // tokens > chunkSize here, the new length is strictly smaller, so the
      // loop always makes progress.
      end = start + ((end - start) * chunkSize ~/ tokens);
      if (end <= start) end = start + 1;
      tokens = _encoder.count(text.substring(start, end));
    }
    chunks.add(text.substring(start, end));
    if (end >= text.length) break;
    if (overlap > 0) {
      // Rewind by the estimated overlap (about 4 characters per token), but
      // never by the whole chunk: the next chunk must start at least one
      // code unit after this one, otherwise the loop would never terminate.
      final overlapChars = (overlap * 4).clamp(0, end - start - 1);
      start = end - overlapChars;
    } else {
      start = end;
    }
  }
  return chunks;
}