encodeWithUnstable method
Tuple2<List<int>, Set<List<int>>>
encodeWithUnstable(
- String text, {
- SpecialTokensSet allowedSpecial = const SpecialTokensSet.empty(),
- SpecialTokensSet disallowedSpecial = const SpecialTokensSet.all(),
Encodes a string into stable tokens and possible completion sequences.
Note that the stable tokens will only represent a substring of text.
See encode for more details on allowedSpecial and disallowedSpecial.
This API should itself be considered unstable.
final enc = getEncoding("gpt2"); // Get instance of encoder
enc.encodeWithUnstable("hello fanta"); // Tuple2([31373], [(277, 4910), (5113, 265), ..., (8842,)])
final text = "hello";
final result = enc.encodeWithUnstable(text);
final stableTokens = result.i1, completions = result.i2;
// Invariants that hold for the result, written as Python-style pseudo-code:
// assert text.encode().startswith(enc.decode_bytes(stableTokens))
// assert all(enc.decode_bytes(stableTokens + seq).startswith(text.encode()) for seq in completions)
Implementation
/// Encodes [text] into a pair of stable tokens and possible completion
/// sequences.
///
/// [allowedSpecial] and [disallowedSpecial] select which special tokens may
/// appear in [text]. When either is flagged as "all", it is resolved against
/// `specialTokensSet`; in that case the disallowed set becomes
/// `specialTokensSet` minus the resolved allowed set.
///
/// [text] is checked against the resolved disallowed set via
/// `_verifyDisallowed` before encoding (presumably rejecting the input if a
/// disallowed token is present; see that helper for the exact behaviour).
///
/// Returns whatever `_coreBPE.encodeUnstableNative` produces for [text] and
/// the resolved allowed set.
Tuple2<List<int>, Set<List<int>>> encodeWithUnstable(
  String text, {
  SpecialTokensSet allowedSpecial = const SpecialTokensSet.empty(),
  SpecialTokensSet disallowedSpecial = const SpecialTokensSet.all(),
}) {
  // An explicit allow-list is used verbatim; the "all" marker is replaced by
  // `specialTokensSet`.
  final permitted =
      !allowedSpecial.isAll ? allowedSpecial.set : specialTokensSet;

  // Likewise for the deny-list, except that "all" means everything in
  // `specialTokensSet` that was not explicitly permitted above.
  final forbidden = !disallowedSpecial.isAll
      ? disallowedSpecial.set
      : specialTokensSet.difference(permitted);

  // Validation runs before any encoding work is handed to the core BPE.
  _verifyDisallowed(text, forbidden);

  return _coreBPE.encodeUnstableNative(text, permitted);
}