BPE

import 'package:bpe/bpe.dart';

/// Demonstrates exact tokenization and the cheaper token-count estimators.
///
/// `main` is `async` because the estimation APIs are awaited below.
Future<void> main() async {
  String something = "Some chars";

  // Round-trip a short string through the o200k_base tokenizer.
  BPETokenizer tokenizer = O200kBaseBPETokenizer();
  // The element type is left to inference so the example stays valid
  // regardless of the token representation `encode` returns.
  final tokens = tokenizer.encode(something);
  String decoded = tokenizer.decode(tokens);

  // Estimate tokens without blowing up the heap by streaming in chunks
  String example =
      """Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.""" *
          100;

  print("CHAR is ${example.length}"); // 57400
  print("Real is ${tokenizer.encode(example).length}"); // 11400

  // Estimate the tokens efficiently, ~99.57% accurate on this sample
  // (11449 estimated vs 11400 real).
  // NOTE(review): the original text advised "multiply count by 0.06" and
  // rounding up (ceil) so the estimate doesn't undershoot the actual token
  // count; a factor below 1 would shrink the count, so a padding factor
  // such as 1.06 was presumably intended - TODO confirm.
  print("ESTI is ${await tokenizer.estimateTokens(example)}"); // 11449 (streams in chunks)

  // You can also feed real streams of strings to estimate tokens.
  // It will only have a buffer of ~1024 to 2048 characters.
  // Assumes `estimateTokensStream` is a `BPETokenizer` method returning
  // `Future<int>`, as the original signature fragment suggested.
  final int streamed =
      await tokenizer.estimateTokensStream(Stream.fromIterable([example]));
  print("STREAM ESTI is $streamed");
}

Libraries

bpe
tiktoken/src/common/byte_array
tiktoken/src/common/special_tokens_set
tiktoken/src/common/utils
tiktoken/src/core_bpe
tiktoken/src/core_bpe_constructor
tiktoken/src/error/tiktoken_error
tiktoken/src/ranks/cl100k_base.tiktoken
tiktoken/src/ranks/cl100k_base/cl100k_base_1.g
tiktoken/src/ranks/cl100k_base/cl100k_base_2.g
tiktoken/src/ranks/cl100k_base/cl100k_base_3.g
tiktoken/src/ranks/cl100k_base/cl100k_base_4.g
tiktoken/src/ranks/cl100k_base/cl100k_base_5.g
tiktoken/src/ranks/cl100k_base/cl100k_base_6.g
tiktoken/src/ranks/cl100k_base/cl100k_base_7.g
tiktoken/src/ranks/cl100k_base/cl100k_base_8.g
tiktoken/src/ranks/cl100k_base/cl100k_base_9.g
tiktoken/src/ranks/cl100k_base/cl100k_base_10.g
tiktoken/src/ranks/cl100k_base/cl100k_base_11.g
tiktoken/src/ranks/index
tiktoken/src/ranks/o200k_base.tiktoken
tiktoken/src/ranks/o200k_base/o200k_base_1
tiktoken/src/ranks/o200k_base/o200k_base_2
tiktoken/src/ranks/o200k_base/o200k_base_3
tiktoken/src/ranks/o200k_base/o200k_base_4
tiktoken/src/ranks/o200k_base/o200k_base_5
tiktoken/src/ranks/o200k_base/o200k_base_6
tiktoken/src/ranks/o200k_base/o200k_base_7
tiktoken/src/ranks/o200k_base/o200k_base_8
tiktoken/src/ranks/o200k_base/o200k_base_9
tiktoken/src/ranks/o200k_base/o200k_base_10
tiktoken/src/ranks/o200k_base/o200k_base_11
tiktoken/src/ranks/o200k_base/o200k_base_12
tiktoken/src/ranks/o200k_base/o200k_base_13
tiktoken/src/ranks/o200k_base/o200k_base_14
tiktoken/src/ranks/o200k_base/o200k_base_15
tiktoken/src/ranks/o200k_base/o200k_base_16
tiktoken/src/ranks/o200k_base/o200k_base_17
tiktoken/src/ranks/o200k_base/o200k_base_18
tiktoken/src/ranks/o200k_base/o200k_base_19
tiktoken/src/ranks/o200k_base/o200k_base_20
tiktoken/src/tiktoken_encoder
tiktoken/src/word_counter
tiktoken/tiktoken_tokenizer_gpt4o_o1