extractText method - TextPage class - dart_mupdf library

extractText method

String extractText()

Extract plain text from the page.

Equivalent to PyMuPDF's textpage.extractText().

Implementation

/// Extracts the plain text of this page in reading order.
///
/// Runs with empty text are ignored. The remaining runs are ordered
/// top-to-bottom by `y`; consecutive runs whose `y` values differ by at most
/// `5.0` are treated as one line, and each line is emitted left-to-right by
/// `x`. Lines are separated by `'\n'`. Within a line a single space is
/// inserted between two runs when the horizontal gap between them exceeds a
/// quarter of the right-hand run's font size and neither run already
/// supplies a space at the join.
///
/// Returns an empty string when there is no text to emit; otherwise the
/// result ends with a trailing newline.
String extractText() {
  if (_textRuns.isEmpty) return '';

  // Runs without text never contribute output, so drop them before ordering.
  final sorted = List<_TextRun>.from(_textRuns)
    ..removeWhere((run) => run.text.isEmpty)
    ..sort((a, b) {
      final dy = a.y.compareTo(b.y);
      return dy != 0 ? dy : a.x.compareTo(b.x);
    });

  // Keep the "no text" result consistent with the empty-page case above.
  if (sorted.isEmpty) return '';

  // Maximum vertical distance between consecutive runs of the same line.
  const lineThreshold = 5.0;

  // Group the y-ordered runs into lines. A run starts a new line when it is
  // further than the threshold from the previous run.
  final lines = <List<_TextRun>>[];
  var lastY = double.negativeInfinity;
  for (final run in sorted) {
    if (lines.isEmpty || (run.y - lastY).abs() > lineThreshold) {
      lines.add([run]);
    } else {
      lines.last.add(run);
    }
    lastY = run.y;
  }

  final buffer = StringBuffer();
  for (final line in lines) {
    // Runs on one line may have slightly different y values, so the global
    // y-then-x order is not a left-to-right order; re-sort each line by x.
    // The y tie-break keeps the result deterministic (List.sort is not
    // guaranteed to be stable).
    line.sort((a, b) {
      final dx = a.x.compareTo(b.x);
      return dx != 0 ? dx : a.y.compareTo(b.y);
    });

    if (buffer.isNotEmpty) buffer.write('\n');

    for (var i = 0; i < line.length; i++) {
      final run = line[i];
      if (i > 0) {
        final previous = line[i - 1];
        // Gap between the estimated right edge of the previous run and the
        // left edge of this one.
        final gap = run.x - (previous.x + previous.approxWidth);
        final spaceWidth = run.fontSize * 0.25;
        // Do not double up when either side already carries the space.
        final alreadySpaced =
            previous.text.endsWith(' ') || run.text.startsWith(' ');
        if (gap > spaceWidth && !alreadySpaced) {
          buffer.write(' ');
        }
      }
      buffer.write(run.text);
    }
  }

  // PyMuPDF always appends a trailing newline
  buffer.write('\n');
  return buffer.toString();
}