encodeUnstableNative method

_Tuple2<List<int>, Set<List<int>>> encodeUnstableNative(
  String text,
  Set<String> allowedSpecial
)

Implementation

/// Encodes [text] and splits the result into a stable token prefix and a set
/// of candidate token sequences ("completions") for the unstable tail.
///
/// [text] is first encoded with `encodeNative`, passing [allowedSpecial]
/// through unchanged. The trailing tokens reported as belonging to the last
/// piece (after widening via `_increaseLastPieceTokenLen`) are decoded back to
/// bytes and removed from the token list. Those bytes are then used to
/// enumerate token sequences that could take their place.
///
/// Returns a tuple whose `i1` is the token list with the unstable tail removed
/// and whose `i2` is the set of completions. The set compares its elements
/// with `util.listEquals` and hashes them with [Object.hashAll], so two
/// completions with the same token ids count as the same entry. The set is
/// empty when the last-piece length reported by `encodeNative` is `0` or when
/// the unstable tail decodes to no bytes.
_Tuple2<List<int>, Set<List<int>>> encodeUnstableNative(
  String text,
  Set<String> allowedSpecial,
) {
  // Completions are token lists, so the set must compare them element-wise
  // rather than by identity. The same kind of set is returned on every path.
  final completions = HashSet<List<int>>(
    equals: util.listEquals,
    hashCode: Object.hashAll,
  );

  final result = encodeNative(text, allowedSpecial);
  var tokens = [...result.i1];
  var lastPieceTokenLen = result.i2;

  // No trailing tokens are reported as unstable: everything is stable.
  if (lastPieceTokenLen == 0) return _Tuple2(tokens, completions);

  final widened = _increaseLastPieceTokenLen(tokens, lastPieceTokenLen);
  tokens = [...widened.i1];
  lastPieceTokenLen = widened.i2;

  // Split off the unstable tail: keep its bytes, drop its tokens.
  final stableLen = tokens.length - lastPieceTokenLen;
  final unstableBytes = decodeNative(tokens.sublist(stableLen));
  tokens.removeRange(stableLen, tokens.length);

  if (unstableBytes.isEmpty) return _Tuple2(tokens, completions);

  // Step 1: every single token whose bytes start with the unstable bytes
  // (including an exact match). This relies on `sortedTokenBytes` being in
  // ascending byte order, as its name suggests, so that all matches are
  // contiguous starting at the partition point.
  var point = sortedTokenBytes.partitionPoint((p0) => p0 < unstableBytes);
  while (point < sortedTokenBytes.length &&
      sortedTokenBytes[point].startsWith(unstableBytes)) {
    completions.add([encoder[sortedTokenBytes[point]]!]);
    point++;
  }

  // Step 2: for every interior split position, extend the suffix to each
  // token that starts with it, glue that token onto the prefix, and re-encode
  // the combined bytes.
  for (int i = 1; i < unstableBytes.length; i++) {
    final prefix = unstableBytes.sublist(0, i);
    final suffix = unstableBytes.sublist(i);

    point = sortedTokenBytes.partitionPoint((p0) => p0 < suffix);

    while (point < sortedTokenBytes.length &&
        sortedTokenBytes[point].startsWith(suffix)) {
      final possibility = [...prefix.bytes, ...sortedTokenBytes[point].bytes];

      // Only a UTF-8 decoding failure selects the raw byte-pair fallback.
      // Errors raised by `encodeOrdinaryNative` itself are not swallowed.
      String? candidateText;
      try {
        candidateText = utf8.decode(possibility);
      } on FormatException {
        // Not valid UTF-8; `candidateText` stays null and the raw byte-pair
        // encoding below is used instead.
      }
      final List<int> encoded = candidateText != null
          ? encodeOrdinaryNative(candidateText)
          : util.bytePairEncode(ByteArray.fromList(possibility), encoder);

      // Keep only as many leading tokens as are needed to cover the length of
      // the unstable bytes.
      final seq = <int>[];
      var seqLen = 0;
      for (final token in encoded) {
        seq.add(token);
        seqLen += decoder[token]!.length;
        if (seqLen >= unstableBytes.length) {
          break;
        }
      }

      completions.add(seq);
      point++;
    }
  }

  // Step 3: if the unstable bytes end in a whitespace character that is
  // preceded by at least one other byte, also offer the encoding obtained by
  // encoding the part before that character and the character itself
  // separately.
  // NOTE(review): presumably this covers text appended later changing how
  // trailing whitespace is split; confirm against the tokenizer's pattern.
  if (unstableBytes.length > 1) {
    final last = decodeLastUtf8(unstableBytes.bytes);
    final headLen = unstableBytes.length - last.i2;

    if (headLen > 0 && isWhitespace(last.i1)) {
      final reencoded = util.bytePairEncode(
        unstableBytes.sublist(0, headLen),
        encoder,
      );
      reencoded.addAll(util.bytePairEncode(
        unstableBytes.sublist(headLen),
        encoder,
      ));
      completions.add(reencoded);
    }
  }

  return _Tuple2(tokens, completions);
}