BaseTokenizer constructor
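BaseTokenizer(List<String> tokens, {int? startingIndex, String initToken = "[CLS]", String eosToken = "[SEP]", String padToken = "[PAD]", String unkToken = "[UNK]"})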
Constructor for BaseTokenizer.
tokens
: List of tokens.
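startingIndex
: ID assigned to the first entry of tokens; defaults to 4 because IDs 0–3 are assigned to the unknown, padding, beginning-of-sequence and end-of-sequence tokens.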
initToken
: Initial token for beginning of sequence.
eosToken
: Token for end of sequence.
padToken
: Token for padding.
unkToken
: Token for unknown elements.
Implementation
/// Creates a tokenizer vocabulary from [tokens] and the four special tokens.
///
/// Each entry of [tokens] receives the ID `startingIndex + position`, where
/// [startingIndex] falls back to 4 when it is omitted or null. IDs 0, 1, 2
/// and 3 are then assigned to [unkToken], [padToken], [initToken] and
/// [eosToken] in that order (the JoeyNMT layout), replacing anything that
/// [tokens] placed at those IDs.
///
/// `s2i` is derived by inverting `i2s`; if one string is stored under
/// several IDs, the ID visited last while iterating `i2s` is kept.
BaseTokenizer(List<String> tokens,
    {int? startingIndex,
    String initToken = "[CLS]",
    String eosToken = "[SEP]",
    String padToken = "[PAD]",
    String unkToken = "[UNK]"}) {
  final int firstTokenId = startingIndex ?? 4;

  // The parameters shadow the fields of the same name, so a bare
  // `padToken = padToken` only reassigns the parameter and leaves the field
  // untouched. `this.` makes the assignment reach the field.
  // NOTE(review): assumes padToken/eosToken/unkToken are declared on the
  // class next to bosToken — confirm against the field list.
  this.padToken = padToken;
  bosToken = initToken;
  this.eosToken = eosToken;
  this.unkToken = unkToken;

  // Regular tokens: ID -> string, offset by the first free ID.
  for (int position = 0; position < tokens.length; position++) {
    i2s[firstTokenId + position] = tokens[position];
  }

  // Special tokens use the same fixed IDs as JoeyNMT. They are written after
  // the regular tokens so they take precedence on any ID collision.
  i2s[0] = unkToken;
  i2s[1] = padToken;
  i2s[2] = initToken;
  i2s[3] = eosToken;

  // Invert the table: string -> ID.
  for (final entry in i2s.entries) {
    s2i[entry.value] = entry.key;
  }

  // Cache the special-token IDs. Every special token was inserted above, so
  // the lookups cannot return null.
  padTokenId = s2i[padToken]!;
  bosTokenId = s2i[initToken]!;
  eosTokenId = s2i[eosToken]!;
  unkTokenId = s2i[unkToken]!;
}