encode method
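Encodes text into a list of token IDs. The IDs are sequential placeholders (0, 1, 2, …) rather than vocabulary entries; only the length of the list, which is an estimated token count, is meaningful.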
Implementation
/// Produces a list of placeholder token IDs for [text].
///
/// The returned IDs are simply the positions `0, 1, 2, …`; they are not
/// looked up in any vocabulary. Only the length of the list carries
/// information: it is the sum of the per-chunk estimates for every match of
/// the split pattern, or, when that sum yields no tokens, the result of the
/// whole-text heuristic.
///
/// Returns a `const` empty list when [text] is empty.
@override
List<int> encode(String text) {
  if (text.isEmpty) {
    return const [];
  }

  final ids = <int>[];

  // Primary path: one run of IDs per regex chunk. Appending the current
  // length keeps the IDs consecutive across chunks without a separate counter.
  for (final match in _cl100kSplitPattern.allMatches(text)) {
    final piece = match[0]!;
    final pieceTokens = _estimateChunkTokens(piece);
    for (var emitted = 0; emitted < pieceTokens; emitted++) {
      ids.add(ids.length);
    }
  }

  if (ids.isNotEmpty) {
    return ids;
  }

  // Fallback path: the pattern contributed nothing, so rely on the
  // whole-text heuristic for the count instead.
  final approxCount = _heuristicCount(text);
  for (var emitted = 0; emitted < approxCount; emitted++) {
    ids.add(ids.length);
  }
  return ids;
}