parseConlluSentences function
Parses a batch CoNLL-U string into per-sentence records.
Implementation
/// Parses a batch CoNLL-U string into per-sentence records.
///
/// Sentences are separated by blank lines. Each emitted record carries:
///  * `text`   - the trimmed value of the sentence's `# text = ` comment, or
///    the empty string when the sentence has no such comment;
///  * `tokens` - an unmodifiable list of the [UDToken]s built from the
///    sentence's token rows, in input order.
///
/// A token row is skipped when it has fewer than 8 tab-separated columns, or
/// when its ID column contains `-` or `.` (e.g. `1-2`, `1.1`) or does not
/// parse as an integer. All other `#` comment lines are ignored. A sentence
/// with no accepted token rows produces no record.
///
/// Both LF and CRLF line endings are accepted.
List<({String text, List<UDToken> tokens})> parseConlluSentences(String conllu) {
  final result = <({String text, List<UDToken> tokens})>[];
  String sentText = '';
  final current = <UDToken>[];

  // Emits the sentence accumulated so far (if any) and resets the state.
  void flush() {
    if (current.isNotEmpty) {
      // List.unmodifiable copies, so clearing `current` below is safe.
      result.add((text: sentText, tokens: List.unmodifiable(current)));
      current.clear();
    }
    // Reset even when nothing was emitted: otherwise the `# text` of a block
    // whose rows were all skipped would be attached to the next sentence.
    sentText = '';
  }

  for (final rawLine in conllu.split('\n')) {
    // Strip the '\r' of a CRLF ending up front so it cannot end up inside the
    // last column of a token row (e.g. deprel on an 8-column row).
    final line = rawLine.endsWith('\r')
        ? rawLine.substring(0, rawLine.length - 1)
        : rawLine;

    if (line.startsWith('# text = ')) {
      sentText = line.substring('# text = '.length).trim();
    } else if (line.isEmpty) {
      // Blank line: sentence boundary.
      flush();
    } else if (!line.startsWith('#')) {
      final p = line.split('\t');
      // Columns 0..7 (ID .. DEPREL) are read below.
      if (p.length < 8) continue;
      // Skip IDs such as `1-2` and `1.1`; only plain integer IDs are kept.
      if (p[0].contains('-') || p[0].contains('.')) continue;
      final id = int.tryParse(p[0]);
      if (id == null) continue;
      final f = UDToken.parseFeats(p[5]);
      current.add(UDToken(
        id: id,
        form: p[1],
        lemma: p[2],
        upos: p[3],
        deprel: p[7],
        // A non-integer HEAD (such as `_`) falls back to 0.
        head: int.tryParse(p[6]) ?? 0,
        gender: f.gender,
        number: f.number,
        degree: f.degree,
      ));
    }
  }
  // The final sentence may not be followed by a blank line.
  flush();
  return result;
}