encodeWithUnstable method

Tuple2<List<int>, Set<List<int>>> encodeWithUnstable(
  String text, {
  SpecialTokensSet allowedSpecial = const SpecialTokensSet.empty(),
  SpecialTokensSet disallowedSpecial = const SpecialTokensSet.all(),
})

Encodes a string into stable tokens and possible completion sequences.

Note that the stable tokens will only represent a substring of text.

See encode for more details on allowedSpecial and disallowedSpecial.

This API should itself be considered unstable.

final enc = getEncoding("gpt2"); // Get instance of encoder
enc.encodeWithUnstable("hello fanta"); // Tuple2([31373], [(277, 4910), (5113, 265), ..., (8842,)])

final text = "hello";
final result = enc.encodeWithUnstable(text);
final stableTokens = result.i1, completions = result.i2;
// Invariants that hold for the result:
// - The bytes decoded from stableTokens are a prefix of the UTF-8 bytes of text.
// - For every seq in completions, the bytes decoded from stableTokens followed
//   by seq start with the UTF-8 bytes of text.

Implementation

/// Encodes [text] into stable tokens and a set of possible completion
/// sequences.
///
/// The stable tokens represent only a substring of [text]. This API should
/// itself be considered unstable.
///
/// [allowedSpecial] is resolved to a concrete set and handed to the encoder;
/// [disallowedSpecial] is resolved to a concrete set and checked against
/// [text] before encoding. An "all" value for either is expanded from
/// `specialTokensSet`.
Tuple2<List<int>, Set<List<int>>> encodeWithUnstable(
  String text, {
  SpecialTokensSet allowedSpecial = const SpecialTokensSet.empty(),
  SpecialTokensSet disallowedSpecial = const SpecialTokensSet.all(),
}) {
  // An "all" allow-list stands for every known special token.
  final permitted =
      allowedSpecial.isAll ? specialTokensSet : allowedSpecial.set;

  // An "all" deny-list means every special token that was not allowed above.
  final forbidden = disallowedSpecial.isAll
      ? specialTokensSet.difference(permitted)
      : disallowedSpecial.set;

  // NOTE(review): presumably rejects text containing a forbidden special
  // token; confirm against _verifyDisallowed.
  _verifyDisallowed(text, forbidden);

  final encoded = _coreBPE.encodeUnstableNative(text, permitted);
  return encoded;
}