encodeUnstableNative method
Implementation
/// Encodes [text] and splits the result into a stable token prefix plus the
/// set of token sequences that could replace the trailing "unstable" part.
///
/// The text is encoded with [encodeNative]; the trailing tokens it reports
/// (widened by [_increaseLastPieceTokenLen]) are removed from the token list
/// and decoded back to bytes. Those bytes are treated as unstable, presumably
/// because appending more text could change how they tokenize.
///
/// Returns a tuple whose first item is the remaining token list and whose
/// second item is the set of candidate token sequences ("completions") found
/// for the unstable bytes. The set is empty when [encodeNative] reports a
/// trailing length of `0` or when the unstable tokens decode to no bytes.
///
/// [allowedSpecial] is forwarded unchanged to [encodeNative].
_Tuple2<List<int>, Set<List<int>>> encodeUnstableNative(
  String text,
  Set<String> allowedSpecial,
) {
  // Created up front so that every return path hands back the same kind of
  // set. Hashing is element-based (Object.hashAll); equality is delegated to
  // util.listEquals (assumed to compare element-wise), so two sequences with
  // the same tokens count as one completion.
  final completions = HashSet<List<int>>(
    equals: util.listEquals,
    hashCode: Object.hashAll,
  );

  final result = encodeNative(text, allowedSpecial);
  // Copy before mutating: the list returned by encodeNative is not ours.
  var tokens = [...result.i1];
  var lastPieceTokenLen = result.i2;
  // NOTE(review): a length of 0 presumably means the text ended in a special
  // token, leaving nothing unstable - confirm against encodeNative.
  if (lastPieceTokenLen == 0) return _Tuple2(tokens, completions);

  final increased = _increaseLastPieceTokenLen(tokens, lastPieceTokenLen);
  tokens = [...increased.i1];
  lastPieceTokenLen = increased.i2;

  // Peel the unstable tokens off the end and keep only their raw bytes.
  final unstableStart = tokens.length - lastPieceTokenLen;
  final unstableBytes = decodeNative(tokens.sublist(unstableStart));
  tokens.removeRange(unstableStart, tokens.length);
  if (unstableBytes.isEmpty) return _Tuple2(tokens, completions);

  // Pass 1: every single token whose bytes start with (or equal) the unstable
  // bytes. partitionPoint locates the first entry that is not less than
  // unstableBytes; this relies on sortedTokenBytes being sorted so that all
  // entries sharing the prefix are contiguous from that point.
  var point = sortedTokenBytes.partitionPoint((p0) => p0 < unstableBytes);
  while (point < sortedTokenBytes.length &&
      sortedTokenBytes[point].startsWith(unstableBytes)) {
    completions.add([
      encoder[sortedTokenBytes[point]]!,
    ]);
    point++;
  }

  // Pass 2: for every interior split position, take each token that starts
  // with the suffix, glue it onto the prefix, and re-encode the whole thing.
  for (var i = 1; i < unstableBytes.length; i++) {
    final prefix = unstableBytes.sublist(0, i);
    final suffix = unstableBytes.sublist(i);
    point = sortedTokenBytes.partitionPoint((p0) => p0 < suffix);
    while (point < sortedTokenBytes.length &&
        sortedTokenBytes[point].startsWith(suffix)) {
      final possibility = [...prefix.bytes, ...sortedTokenBytes[point].bytes];

      // Only a UTF-8 decoding failure selects the byte-level fallback; any
      // other failure (e.g. inside encodeOrdinaryNative) must propagate
      // instead of being silently masked.
      String? decoded;
      try {
        decoded = utf8.decode(possibility);
      } on FormatException {
        // Not valid UTF-8: `decoded` stays null and the raw bytes are
        // byte-pair encoded below.
      }
      final List<int> encoded = decoded != null
          ? encodeOrdinaryNative(decoded)
          : util.bytePairEncode(ByteArray.fromList(possibility), encoder);

      // Keep leading tokens only until their combined byte length covers the
      // unstable bytes; anything after that is dropped.
      final seq = <int>[];
      var seqLen = 0;
      for (final token in encoded) {
        seq.add(token);
        seqLen += decoder[token]!.length;
        if (seqLen >= unstableBytes.length) {
          break;
        }
      }
      completions.add(seq);
      point++;
    }
  }

  // Pass 3: if the unstable bytes end in a whitespace character that is
  // preceded by at least one other byte, also offer the encoding obtained by
  // encoding "everything before it" and "that last character" separately.
  // NOTE(review): this appears to mirror the upstream tiktoken workaround for
  // regex splits that shift when more text is appended - confirm against the
  // upstream implementation.
  if (unstableBytes.length > 1) {
    final last = decodeLastUtf8(unstableBytes.bytes);
    if (unstableBytes.length - last.i2 > 0 && isWhitespace(last.i1)) {
      final reencoded = util.bytePairEncode(
        unstableBytes.sublist(0, unstableBytes.length - last.i2),
        encoder,
      );
      reencoded.addAll(util.bytePairEncode(
        unstableBytes.sublist(unstableBytes.length - last.i2),
        encoder,
      ));
      completions.add(reencoded);
    }
  }

  return _Tuple2(tokens, completions);
}