CoreBPE.create constructor

CoreBPE.create(
  Map<ByteArray, int> encoder,
  Map<String, int> specialTokensEncoder,
  String pattern
)

Implementation

/// Builds a [CoreBPE] from a token vocabulary, a special-token vocabulary,
/// and the source of the regular expression stored as `regex`.
///
/// * [encoder] maps token byte sequences to integer ids. A copy of it is
///   stored, and an inverse (id -> bytes) table is derived from it, calling
///   `clone()` on every key.
/// * [specialTokensEncoder] maps special-token strings to integer ids. A copy
///   of it is stored, and an inverse (id -> UTF-8 bytes) table is derived
///   from it.
/// * [pattern] is compiled as a Unicode-mode [RegExp].
///
/// When assertions are enabled, construction fails if the inverse table ends
/// up smaller than [encoder], which happens when two entries share an id.
factory CoreBPE.create(
  Map<ByteArray, int> encoder,
  Map<String, int> specialTokensEncoder,
  String pattern,
) {
  final regex = RegExp(pattern, unicode: true);

  // Alternation of all special-token strings, each escaped so that it is
  // matched literally rather than interpreted as regex syntax.
  // NOTE(review): if [specialTokensEncoder] is empty the pattern source is "",
  // which matches the empty string at every position - confirm the code that
  // consumes `specialRegex` tolerates that.
  final specialRegex = RegExp(
    specialTokensEncoder.keys.map(RegExp.escape).join("|"),
    unicode: true,
  );

  // Inverse of [encoder]. Keys are cloned so the decoder does not share the
  // encoder's key objects.
  final decoder = HashMap.of(encoder.map((k, v) => MapEntry(v, k.clone())));
  // Duplicate ids collapse into one decoder entry, so a size mismatch means
  // the vocabulary is not invertible.
  assert(
    encoder.length == decoder.length,
    'Encoder and decoder must be of equal length; '
    'the encoder probably contains duplicate token ids.',
  );

  // Inverse of [specialTokensEncoder], holding each token as UTF-8 bytes.
  final specialTokensDecoder = HashMap.of(
    specialTokensEncoder.map((k, v) => MapEntry(v, ByteArray.fromList(utf8.encode(k)))),
  );

  // Token byte sequences ordered by ascending length.
  // NOTE(review): this list is passed through sortListOfUint8List below;
  // whether this length pre-sort still influences the final order depends on
  // that helper - confirm before removing either step.
  final sortedTokenBytes = encoder.keys.toList()
    ..sort((a, b) => a.length.compareTo(b.length));

  return CoreBPE._internal(
    // HashMap.of keeps the copies statically typed, matching how the decoder
    // tables above are built.
    encoder: HashMap.of(encoder),
    specialTokensEncoder: HashMap.of(specialTokensEncoder),
    decoder: decoder,
    specialTokensDecoder: specialTokensDecoder,
    regex: regex,
    specialRegex: specialRegex,
    sortedTokenBytes: sortListOfUint8List(sortedTokenBytes),
  );
}