encode method

@override
List<int> encode(String text)
override

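Encodes text into a list of token IDs. The IDs are sequential placeholders (0, 1, 2, …) produced from an estimated token count for each regex-split chunk, so the length of the list is the estimated number of tokens.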

Implementation

@override
List<int> encode(String text) {
  if (text.isEmpty) return const [];
  final matches = _cl100kSplitPattern.allMatches(text);
  final tokens = <int>[];
  var id = 0;
  for (final m in matches) {
    final chunk = m.group(0)!;
    // Estimate sub-tokens per chunk based on character class.
    final subTokens = _estimateChunkTokens(chunk);
    for (var i = 0; i < subTokens; i++) {
      tokens.add(id++);
    }
  }
  // Fallback: if regex didn't match anything, approximate.
  if (tokens.isEmpty) {
    final est = _heuristicCount(text);
    for (var i = 0; i < est; i++) {
      tokens.add(i);
    }
  }
  return tokens;
}