encodeString method

Future<SPTokenContainer> encodeString(String stringToEncode)

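Encodes a text string into a list of integer tokens using BPE (Byte Pair Encoding) and returns an SPTokenContainer holding the encoded tokens, the number of tokens, and the number of characters in the original string.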

Implementation

/// Encodes [stringToEncode] into a sequence of integer BPE tokens.
///
/// The input is split into pieces with `pat`. Each piece is turned into a
/// list of ints by `_encodeStr`, mapped through `byteEncoder`, merged by
/// `_bpe`, and every resulting space-separated sub-token is looked up in the
/// encoder dictionary. Sub-tokens missing from the dictionary fall back to
/// the dictionary entry for `'!'`.
///
/// The encoder dictionary and the BPE ranks are loaded on first use and
/// reused on later calls.
///
/// Returns an [SPTokenContainer] carrying the tokens, the token count, and
/// `stringToEncode.length` as the character count.
Future<SPTokenContainer> encodeString(String stringToEncode) async {
  // Lazily populate the caches; the encoder is loaded before the ranks.
  _encoder ??= await _loadEncoder();
  _bpeRanks ??= await _loadBpeRanks();

  final tokenIds = <int>[];

  for (final regexMatch in pat.allMatches(stringToEncode)) {
    final piece = regexMatch.group(0);
    if (piece == null) {
      continue;
    }

    // Translate each int of the piece through the byte table and glue the
    // results together into the string that BPE operates on.
    final List<int> pieceBytes = _encodeStr(piece);
    final String byteMapped =
        pieceBytes.map((byte) => byteEncoder[byte]).join('');

    // `_bpe` yields the merged sub-tokens separated by single spaces.
    final String merged = _bpe(byteMapped, _bpeRanks!);
    final List<String> subTokens = merged.split(' ');

    final encoder = _encoder!;
    for (final subToken in subTokens) {
      // Unknown sub-tokens are replaced by the id stored under '!'.
      final id =
          encoder.containsKey(subToken) ? encoder[subToken] : encoder['!'];
      // The lookup result is nullable, so it is narrowed to int here.
      tokenIds.add(id as int);
    }
  }

  return SPTokenContainer(
    tokens: tokenIds,
    tokenCount: tokenIds.length,
    characterCount: stringToEncode.length,
  );
}