extractInfo method
Implementation
/// Builds a [PdfDocumentInfo] summary (version, info dictionary, page count,
/// per-page media boxes and images) for the parsed document.
///
/// [maxPages] is forwarded unchanged to the page-reference collectors;
/// presumably it caps how many pages are visited (confirm against
/// `_collectPageRefs` / `_collectPageRefsByScan`).
///
/// When the trailer has no root object, or the root object cannot be loaded
/// as a dictionary, an empty summary reporting version `'1.4'` is returned.
PdfDocumentInfo extractInfo({int? maxPages}) {
  // Shared result for every "no usable catalog" exit below.
  const emptyInfo = PdfDocumentInfo(
    version: '1.4',
    pageCount: 0,
    mediaBoxes: <PdfPageMediaBoxInfo>[],
    images: <PdfImageInfo>[],
  );

  _ensureXrefParsed();
  final trailer = _trailerInfo ??
      PdfParserXref.readTrailerInfoFromReader(reader, xrefOffset);

  final rootObjId = trailer.rootObj;
  if (rootObjId == null) return emptyInfo;

  // Try the non-stream lookup first, then the general lookup.
  final rootObj = _getObjectNoStream(rootObjId) ?? _getObject(rootObjId);
  final rootDict = rootObj?.value;
  if (rootDict is! PdfDictToken) return emptyInfo;

  // Walk the page tree when the catalog points at one.
  final pagesRef =
      PdfParserObjects.asRef(rootDict.values[PdfNameTokens.pages]);
  var pageRefs = pagesRef == null
      ? <PdfRefToken>[]
      : _collectPageRefs(pagesRef, maxPages: maxPages);

  // With repair enabled, a scan replaces the tree walk whenever a repair was
  // already attempted or the tree walk produced no pages.
  final wantsScan = _repairAttempted || pageRefs.isEmpty;
  if (wantsScan && _allowRepair) {
    pageRefs = _collectPageRefsByScan(maxPages: maxPages);
  }

  final mediaBoxes = <PdfPageMediaBoxInfo>[];
  // Images are deduplicated by object id across pages, matching
  // `mutool info` behavior.
  final images = <PdfImageInfo>[];
  final seenImageRefs = <int>{};

  // 1-based page number; advanced for every reference, including the ones
  // skipped because their object is missing or not a dictionary.
  var pageNumber = 0;
  for (final pageRef in pageRefs) {
    pageNumber++;

    final pageObj =
        _getObjectNoStream(pageRef.obj) ?? _getObject(pageRef.obj);
    final pageDict = pageObj?.value;
    if (pageDict is! PdfDictToken) continue;

    final box = _resolvePageMediaBox(pageDict);
    if (box != null) {
      mediaBoxes.add(
        PdfPageMediaBoxInfo(
          pageIndex: pageNumber,
          pageRef: PdfIndirectRef(pageRef.obj, pageRef.gen),
          box: box,
        ),
      );
    }

    final resources = _resolvePageResources(pageDict);
    final usedXObjects = _extractXObjectNamesFromContent(pageDict);

    // Even in repair mode, resource traversal is attempted first. Some
    // real-world PDFs keep page images behind Form XObjects; going straight
    // to scan-only mapping can pair images with the wrong page (a mismatch
    // against mutool output).
    _collectImagesFromResources(
      resources: resources,
      pageIndex: pageNumber,
      pageRef: PdfIndirectRef(pageRef.obj, pageRef.gen),
      out: images,
      seenImageRefs: seenImageRefs,
      // An empty name list means "no restriction" (null), not "allow none".
      allowedTopLevelXObjects:
          usedXObjects.isNotEmpty ? usedXObjects.toSet() : null,
    );
  }

  // Last-resort fallback: binary scan, used only when page/resource
  // traversal found nothing. It should not be the primary path for repaired
  // documents.
  if (images.isEmpty && _allowRepair && pageRefs.isNotEmpty) {
    images.addAll(extractImages(includeUnusedXObjects: true));
  }

  // Info dictionary details are only available when the trailer names one.
  final infoObjId = trailer.infoObj;
  final infoMap = infoObjId == null ? null : _readInfoDict(infoObjId);
  final infoEntry = infoObjId == null ? null : _xrefEntries[infoObjId];
  final infoRef = infoObjId == null
      ? null
      : PdfIndirectRef(infoObjId, infoEntry?.gen ?? 0);

  return PdfDocumentInfo(
    version: version.name.replaceAll('pdf_', '').replaceAll('_', '.'),
    infoRef: infoRef,
    infoDict: infoMap,
    pageCount: pageRefs.length,
    mediaBoxes: mediaBoxes,
    images: images,
  );
}