extractInfo method

PdfDocumentInfo extractInfo({
  int? maxPages,
})

Implementation

/// Collects document-level information: the PDF version string, the trailer
/// Info reference and dictionary, the page count, per-page media boxes, and
/// image descriptors.
///
/// [maxPages] is forwarded unchanged to the page-collection helpers
/// (presumably an upper bound on the pages visited — confirm in those helpers).
///
/// Returns an empty result (version `'1.4'`, zero pages, no media boxes and
/// no images) when the trailer names no root object, or when that object
/// cannot be loaded or its value is not a [PdfDictToken].
PdfDocumentInfo extractInfo({int? maxPages}) {
  // Shared result for every "no usable root object" exit below.
  // NOTE(review): this reports a fixed '1.4' rather than the parsed
  // `version` used by the normal return path — confirm that is intended.
  const emptyInfo = PdfDocumentInfo(
    version: '1.4',
    pageCount: 0,
    mediaBoxes: <PdfPageMediaBoxInfo>[],
    images: <PdfImageInfo>[],
  );

  _ensureXrefParsed();

  final trailer = _trailerInfo ??
      PdfParserXref.readTrailerInfoFromReader(reader, xrefOffset);
  final rootObjId = trailer.rootObj;
  if (rootObjId == null) return emptyInfo;

  // Testing a local copy of the value lets the `is!` check promote it, so no
  // separate `as` cast is needed afterwards.
  final rootObj = _getObjectNoStream(rootObjId) ?? _getObject(rootObjId);
  final rootDict = rootObj?.value;
  if (rootDict is! PdfDictToken) return emptyInfo;

  final pagesRef =
      PdfParserObjects.asRef(rootDict.values[PdfNameTokens.pages]);
  var pageRefs = pagesRef != null
      ? _collectPageRefs(pagesRef, maxPages: maxPages)
      : <PdfRefToken>[];

  // When repair is allowed, the scan-based page list replaces the page-tree
  // result if a repair was already attempted or the tree yielded no pages.
  if ((_repairAttempted || pageRefs.isEmpty) && _allowRepair) {
    pageRefs = _collectPageRefsByScan(maxPages: maxPages);
  }

  final mediaBoxes = <PdfPageMediaBoxInfo>[];
  // Deduplicate by image object id across pages, matching mutool info behavior.
  final images = <PdfImageInfo>[];
  final seenImageRefs = <int>{};
  for (var i = 0; i < pageRefs.length; i++) {
    final pageRef = pageRefs[i];
    // Reported page numbers are 1-based positions in `pageRefs`; a page that
    // fails to load is skipped without renumbering the pages after it.
    final pageNumber = i + 1;

    final pageObj =
        _getObjectNoStream(pageRef.obj) ?? _getObject(pageRef.obj);
    final pageDict = pageObj?.value;
    if (pageDict is! PdfDictToken) continue;

    final mediaBox = _resolvePageMediaBox(pageDict);
    if (mediaBox != null) {
      mediaBoxes.add(PdfPageMediaBoxInfo(
        pageIndex: pageNumber,
        pageRef: PdfIndirectRef(pageRef.obj, pageRef.gen),
        box: mediaBox,
      ));
    }

    final resDict = _resolvePageResources(pageDict);
    final usedXObjects = _extractXObjectNamesFromContent(pageDict);
    // Important: even in repair mode, try resource traversal first.
    // Some real-world PDFs store page images behind Form XObjects. If we
    // skip traversal and fallback to scan-only mapping, image/page pairing can
    // become incorrect (content mismatch against mutool output).
    _collectImagesFromResources(
      resources: resDict,
      pageIndex: pageNumber,
      pageRef: PdfIndirectRef(pageRef.obj, pageRef.gen),
      out: images,
      seenImageRefs: seenImageRefs,
      // An empty name list is passed as null instead of an empty set.
      allowedTopLevelXObjects:
          usedXObjects.isNotEmpty ? usedXObjects.toSet() : null,
    );
  }

  // Last resort fallback: binary scan when page/resource traversal finds
  // nothing. This should not be the primary path for repaired documents.
  if (images.isEmpty && _allowRepair && pageRefs.isNotEmpty) {
    images.addAll(extractImages(includeUnusedXObjects: true));
  }

  // Copy the Info object id to a local so the null checks promote it and no
  // `!` assertions are required.
  final infoObjId = trailer.infoObj;
  final infoMap = infoObjId != null ? _readInfoDict(infoObjId) : null;
  final infoEntry = infoObjId != null ? _xrefEntries[infoObjId] : null;

  return PdfDocumentInfo(
    version: version.name.replaceAll('pdf_', '').replaceAll('_', '.'),
    // The generation falls back to 0 when the xref lookup yields no entry.
    infoRef: infoObjId != null
        ? PdfIndirectRef(infoObjId, infoEntry?.gen ?? 0)
        : null,
    infoDict: infoMap,
    pageCount: pageRefs.length,
    mediaBoxes: mediaBoxes,
    images: images,
  );
}