betto_icu 0.1.0-dev.1
betto_icu: ^0.1.0-dev.1
Unicode text tokenization for Dart — Tokenizer interface, IcuTokenizer (system ICU FFI, UAX #29), and RegExpTokenizer (pure Dart, Latin fallback).
example/betto_icu_example.dart
// Copyright 2026 The Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import 'package:betto_icu/betto_icu.dart';
/// Tokenises one sample sentence with each available tokenizer and prints the
/// resulting tokens, labelled by tokenizer name, so the outputs can be
/// compared side by side.
void main() {
  const sample = '"The Strange Case of Dr. Jekyll and Mr. Hyde" '
      'by Robert Louis Stevenson.';

  // ICU-backed tokenizer first.
  final icuTokens = IcuTokenizer().tokenise(sample);
  print('IcuTokenizer: $icuTokens');

  // Then the RegExp-based tokenizer on the same input.
  final regExpTokens = RegExpTokenizer().tokenise(sample);
  print('RegExpTokenizer: $regExpTokens');

  // Web targets: BrowserTokenizer offers UAX #29 quality segmentation through
  // the browser's built-in Intl.Segmenter API, with zero bundle cost and no
  // FFI. Requires Chrome 87+, Firefox 125+, or Safari 16.4+.
  //
  // final browser = BrowserTokenizer();
  // print('BrowserTokenizer: ${browser.tokenise(sample)}');
}