encodeString method
A private member variable for caching a list of BPE (Byte Pair Encoding) merges.
Implementation
/// Encodes [stringToEncode] into a list of integer BPE tokens.
///
/// The input is split into chunks with `pat`. Each chunk is converted to a
/// list of ints by `_encodeStr` (UTF-8 bytes, per the original description —
/// confirm against `_encodeStr`), mapped through `byteEncoder` into a single
/// string, and merged by `_bpe`. Every space-separated piece of the merged
/// string is then looked up in the encoder dictionary; pieces that are not
/// in the dictionary fall back to the entry for `'!'`.
///
/// The encoder dictionary and the BPE ranks are loaded on first use and
/// cached in `_encoder` and `_bpeRanks`.
///
/// Returns an [SPTokenContainer] holding the tokens, the number of tokens,
/// and the length of [stringToEncode]. The length is `String.length`, which
/// counts UTF-16 code units rather than user-perceived characters.
Future<SPTokenContainer> encodeString(String stringToEncode) async {
  _encoder ??= await _loadEncoder();
  _bpeRanks ??= await _loadBpeRanks();

  // Fields do not promote to non-nullable, so bind them to locals once
  // instead of asserting with `!` at every use inside the loop.
  final encoder = _encoder!;
  final bpeRanks = _bpeRanks!;

  final bpeTokens = <int>[];
  for (final match in pat.allMatches(stringToEncode)) {
    final chunk = match.group(0);
    if (chunk == null) continue;

    // Re-spell the chunk's encoded values with the byte-to-string table so
    // `_bpe` operates on a plain string.
    final token = _encodeStr(chunk).map((byte) => byteEncoder[byte]).join();

    // `_bpe` returns its merged pieces separated by single spaces.
    final pieces = _bpe(token, bpeRanks).split(' ');

    // Unknown pieces map to the id stored under '!'. The cast turns the
    // nullable lookup results into ints and throws if a value is missing.
    bpeTokens.addAll(
      pieces.map((piece) => encoder[piece] ?? encoder['!']).cast<int>(),
    );
  }

  return SPTokenContainer(
    tokens: bpeTokens,
    tokenCount: bpeTokens.length,
    characterCount: stringToEncode.length,
  );
}