parseConlluSentences function

List<({String text, List<UDToken> tokens})> parseConlluSentences(
  1. String conllu
)

Parses a batch CoNLL-U string into per-sentence records.

Implementation

/// Parses a batch CoNLL-U string into per-sentence records.
///
/// Each record pairs the sentence's `# text = ` comment value (trimmed; the
/// empty string when the sentence has no such comment) with an unmodifiable
/// list of the tokens read for that sentence. Sentences are separated by
/// blank lines; a block that yields no tokens produces no record.
///
/// Token lines are tab-separated and read by column index: 0 = id, 1 = form,
/// 2 = lemma, 3 = upos, 5 = feats, 6 = head, 7 = deprel. A token line is
/// skipped when it has fewer than 8 columns, when its id contains `-` or `.`
/// (multiword-token ranges and empty nodes in CoNLL-U), or when its id is not
/// an integer. A head that does not parse as an integer is stored as 0.
///
/// Both `\n` and `\r\n` line endings are accepted.
List<({String text, List<UDToken> tokens})> parseConlluSentences(String conllu) {
  final result = <({String text, List<UDToken> tokens})>[];
  String sentText = '';
  final current = <UDToken>[];

  // Emits the pending sentence (if it has tokens) and resets per-sentence state.
  void flush() {
    if (current.isNotEmpty) {
      // List.unmodifiable copies, so clearing `current` below is safe.
      result.add((text: sentText, tokens: List.unmodifiable(current)));
      current.clear();
    }
    // Reset unconditionally: if a block had a `# text = ` line but no accepted
    // tokens, its text must not leak into the following sentence.
    sentText = '';
  }

  for (final raw in conllu.split('\n')) {
    // Strip the CR of a CRLF ending up front so it cannot end up inside the
    // last column of a token line (deprel when the line has exactly 8 columns).
    final line = raw.endsWith('\r') ? raw.substring(0, raw.length - 1) : raw;

    if (line.startsWith('# text = ')) {
      sentText = line.substring('# text = '.length).trim();
    } else if (line.trim().isEmpty) {
      // Blank (or whitespace-only) line: sentence boundary.
      flush();
    } else if (!line.startsWith('#')) {
      final p = line.split('\t');
      if (p.length < 8) continue;
      // Skip multiword-token ranges ("1-2") and empty nodes ("1.1").
      if (p[0].contains('-') || p[0].contains('.')) continue;
      final id = int.tryParse(p[0]);
      if (id == null) continue;
      final f = UDToken.parseFeats(p[5]);
      current.add(UDToken(
        id:     id,
        form:   p[1],
        lemma:  p[2],
        upos:   p[3],
        deprel: p[7],
        // Non-numeric heads (e.g. "_") fall back to 0.
        head:   int.tryParse(p[6]) ?? 0,
        gender: f.gender,
        number: f.number,
        degree: f.degree,
      ));
    }
    // Any other `#` comment line is ignored.
  }
  // Input may not end with a blank line; emit the trailing sentence.
  flush();
  return result;
}