extractImages method

List<PdfImageInfo> extractImages({
  int? fromPage,
  int? toPage,
  bool includeUnusedXObjects = false,
})

Extrai imagens do PDF com suporte a faixa de páginas.

Se includeUnusedXObjects for true, considera todos os XObjects da página, sem checar se foram usados no content stream.

Implementation

/// Extracts image metadata from the document, optionally limited to a
/// 1-based, inclusive page range.
///
/// [fromPage] and [toPage] bound the page numbers that are reported; a null
/// bound leaves that side open. [toPage] is also forwarded as `maxPages` to
/// the page-collection helpers.
///
/// When [includeUnusedXObjects] is false, the XObject names obtained from the
/// page content are passed on as the allowed top-level set; when true, no
/// restriction is passed.
///
/// Returns an empty list when the trailer has no root object or the root
/// object is not a dictionary.
List<PdfImageInfo> extractImages({
  int? fromPage,
  int? toPage,
  bool includeUnusedXObjects = false,
}) {
  _ensureXrefParsed();

  final trailer = _trailerInfo ??
      PdfParserXref.readTrailerInfoFromReader(reader, xrefOffset);
  final catalogId = trailer.rootObj;
  if (catalogId == null) return const <PdfImageInfo>[];

  final catalog = _getObjectNoStream(catalogId) ?? _getObject(catalogId);
  final catalogDict = catalog?.value;
  if (catalogDict is! PdfDictToken) return const <PdfImageInfo>[];

  final pageTreeRef =
      PdfParserObjects.asRef(catalogDict.values[PdfNameTokens.pages]);
  var pageRefs = pageTreeRef == null
      ? <PdfRefToken>[]
      : _collectPageRefs(pageTreeRef, maxPages: toPage);

  // Switch to a raw scan when a repair already happened or the page tree
  // produced nothing, provided repairs are permitted.
  if ((_repairAttempted || pageRefs.isEmpty) && _allowRepair) {
    pageRefs = _collectPageRefsByScan(maxPages: toPage);
  }

  // Whether a 1-based page number lies inside the requested range.
  bool withinRange(int pageNumber) {
    if (fromPage != null && pageNumber < fromPage) return false;
    if (toPage != null && pageNumber > toPage) return false;
    return true;
  }

  // Deduplicate by image object id across pages, matching mutool info
  // behavior.
  final result = <PdfImageInfo>[];
  final visitedImageIds = <int>{};

  for (var pageNumber = 1; pageNumber <= pageRefs.length; pageNumber++) {
    if (!withinRange(pageNumber)) continue;

    final ref = pageRefs[pageNumber - 1];
    final page = _getObjectNoStream(ref.obj) ?? _getObject(ref.obj);
    final pageDict = page?.value;
    if (pageDict is! PdfDictToken) continue;

    final resources = _resolvePageResources(pageDict);
    // A null set means "do not filter by usage in the content stream".
    final referencedNames = includeUnusedXObjects
        ? null
        : _extractXObjectNamesFromContent(pageDict).toSet();
    _collectImagesFromResources(
      resources: resources,
      pageIndex: pageNumber,
      pageRef: PdfIndirectRef(ref.obj, ref.gen),
      out: result,
      seenImageRefs: visitedImageIds,
      allowedTopLevelXObjects: referencedNames,
    );
  }

  final needsScanFallback =
      result.isEmpty && _allowRepair && pageRefs.isNotEmpty;
  if (!needsScanFallback) return result;

  // Last resort for damaged structures where resources cannot be resolved.
  // Mapping by index is best-effort and less reliable than resource
  // traversal: when the number of scanned images equals the number of pages,
  // image N is attributed to page N; otherwise every image is attributed to
  // the first page while its position in the scan is still used as the page
  // index for filtering and reporting.
  final scanned = _collectImagesByScan();
  final alignedWithPages = scanned.length == pageRefs.length;
  for (var position = 1; position <= scanned.length; position++) {
    if (!withinRange(position)) continue;

    final owner = alignedWithPages ? pageRefs[position - 1] : pageRefs.first;
    final found = scanned[position - 1];
    result.add(PdfImageInfo(
      pageIndex: position,
      pageRef: PdfIndirectRef(owner.obj, owner.gen),
      imageRef: found.imageRef,
      width: found.width,
      height: found.height,
      bitsPerComponent: found.bitsPerComponent,
      colorSpace: found.colorSpace,
      filter: found.filter,
    ));
  }

  return result;
}