Skip to content

Commit ad3a271

Browse files
committed
[api-minor] Clear all caches in XRef.indexObjects, and improve /Root dictionary validation in XRef.parse (issue 14303)
*This patch improves handling of a couple of PDF documents from issue 14303.* - Update `XRef.indexObjects` to actually clear *all* XRef-caches. Invalid XRef tables *usually* cause issues early enough during parsing that we've not populated the XRef-cache, however to prevent any issues we obviously need to clear that one as well. - Improve the /Root dictionary validation in `XRef.parse` (PR 9827 follow-up). In addition to checking that a /Pages entry exists, we'll now also check that it can be successfully fetched *and* that it's of the correct type. There's really no point trying to use a /Root dictionary that e.g. `Catalog.toplevelPagesDict` will reject, and this way we'll be able to fallback to indexing the objects in corrupt documents. - Throw an `InvalidPDFException`, rather than a general `FormatError`, in `XRef.parse` when no usable /Root dictionary could be found. That really seems more appropriate overall, since all attempts at parsing/recovery have failed. (This part of the patch is API-observable, hence the tag.) With these changes, two existing test-cases are improved and the unit-tests are updated/re-factored to highlight that. In particular `GHOSTSCRIPT-698804-1-fuzzed.pdf` will now both load and "render" correctly, whereas `poppler-395-0-fuzzed.pdf` will now fail immediately upon loading (rather than *appearing* to work).
1 parent e9e4b91 commit ad3a271

File tree

2 files changed

+66
-43
lines changed

2 files changed

+66
-43
lines changed

src/core/xref.js

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -107,14 +107,26 @@ class XRef {
107107
}
108108
warn(`XRef.parse - Invalid "Root" reference: "${ex}".`);
109109
}
110-
if (root instanceof Dict && root.has("Pages")) {
111-
this.root = root;
112-
} else {
113-
if (!recoveryMode) {
114-
throw new XRefParseException();
110+
if (root instanceof Dict) {
111+
try {
112+
const pages = root.get("Pages");
113+
if (pages instanceof Dict) {
114+
this.root = root;
115+
return;
116+
}
117+
} catch (ex) {
118+
if (ex instanceof MissingDataException) {
119+
throw ex;
120+
}
121+
warn(`XRef.parse - Invalid "Pages" reference: "${ex}".`);
115122
}
116-
throw new FormatError("Invalid root reference");
117123
}
124+
125+
if (!recoveryMode) {
126+
throw new XRefParseException();
127+
}
128+
// Even recovery failed, there's nothing more we can do here.
129+
throw new InvalidPDFException("Invalid Root reference.");
118130
}
119131

120132
processXRefTable(parser) {
@@ -417,6 +429,7 @@ class XRef {
417429

418430
// Clear out any existing entries, since they may be bogus.
419431
this.entries.length = 0;
432+
this._cacheMap.clear();
420433

421434
const stream = this.stream;
422435
stream.pos = 0;

test/unit/api_spec.js

Lines changed: 47 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,7 @@ describe("api", function () {
445445
await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
446446
});
447447

448-
it("creates pdf doc from PDF file with bad XRef table", async function () {
448+
it("creates pdf doc from PDF file with bad XRef entry", async function () {
449449
// A corrupt PDF file, where the XRef table have (some) bogus entries.
450450
const loadingTask = getDocument(
451451
buildGetDocumentParams("PDFBOX-4352-0.pdf", {
@@ -468,6 +468,26 @@ describe("api", function () {
468468
await loadingTask.destroy();
469469
});
470470

471+
it("creates pdf doc from PDF file with bad XRef header", async function () {
472+
const loadingTask = getDocument(
473+
buildGetDocumentParams("GHOSTSCRIPT-698804-1-fuzzed.pdf")
474+
);
475+
expect(loadingTask instanceof PDFDocumentLoadingTask).toEqual(true);
476+
477+
const pdfDocument = await loadingTask.promise;
478+
expect(pdfDocument.numPages).toEqual(1);
479+
480+
const page = await pdfDocument.getPage(1);
481+
expect(page instanceof PDFPageProxy).toEqual(true);
482+
483+
const opList = await page.getOperatorList();
484+
expect(opList.fnArray.length).toEqual(0);
485+
expect(opList.argsArray.length).toEqual(0);
486+
expect(opList.lastChunk).toEqual(true);
487+
488+
await loadingTask.destroy();
489+
});
490+
471491
it("creates pdf doc from PDF file with bad XRef byteWidths", async function () {
472492
// A corrupt PDF file, where the XRef /W-array have (some) bogus entries.
473493
const loadingTask = getDocument(
@@ -488,37 +508,49 @@ describe("api", function () {
488508
await loadingTask.destroy();
489509
});
490510

511+
it("creates pdf doc from PDF file with inaccessible /Pages tree", async function () {
512+
const loadingTask = getDocument(
513+
buildGetDocumentParams("poppler-395-0-fuzzed.pdf")
514+
);
515+
expect(loadingTask instanceof PDFDocumentLoadingTask).toEqual(true);
516+
517+
try {
518+
await loadingTask.promise;
519+
520+
// Shouldn't get here.
521+
expect(false).toEqual(true);
522+
} catch (reason) {
523+
expect(reason instanceof InvalidPDFException).toEqual(true);
524+
expect(reason.message).toEqual("Invalid Root reference.");
525+
}
526+
527+
await loadingTask.destroy();
528+
});
529+
491530
it("creates pdf doc from PDF files, with bad /Pages tree /Count", async function () {
492531
const loadingTask1 = getDocument(
493532
buildGetDocumentParams("poppler-67295-0.pdf")
494533
);
495534
const loadingTask2 = getDocument(
496535
buildGetDocumentParams("poppler-85140-0.pdf")
497536
);
498-
const loadingTask3 = getDocument(
499-
buildGetDocumentParams("poppler-395-0-fuzzed.pdf")
500-
);
501-
const loadingTask4 = getDocument(
502-
buildGetDocumentParams("GHOSTSCRIPT-698804-1-fuzzed.pdf")
503-
);
504537

505538
expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true);
506539
expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true);
507-
expect(loadingTask3 instanceof PDFDocumentLoadingTask).toEqual(true);
508-
expect(loadingTask4 instanceof PDFDocumentLoadingTask).toEqual(true);
509540

510541
const pdfDocument1 = await loadingTask1.promise;
511542
const pdfDocument2 = await loadingTask2.promise;
512-
const pdfDocument3 = await loadingTask3.promise;
513-
const pdfDocument4 = await loadingTask4.promise;
514543

515544
expect(pdfDocument1.numPages).toEqual(1);
516545
expect(pdfDocument2.numPages).toEqual(1);
517-
expect(pdfDocument3.numPages).toEqual(1);
518-
expect(pdfDocument4.numPages).toEqual(1);
519546

520-
const pageA = await pdfDocument1.getPage(1);
521-
expect(pageA instanceof PDFPageProxy).toEqual(true);
547+
const page = await pdfDocument1.getPage(1);
548+
expect(page instanceof PDFPageProxy).toEqual(true);
549+
550+
const opList = await page.getOperatorList();
551+
expect(opList.fnArray.length).toBeGreaterThan(5);
552+
expect(opList.argsArray.length).toBeGreaterThan(5);
553+
expect(opList.lastChunk).toEqual(true);
522554

523555
try {
524556
await pdfDocument2.getPage(1);
@@ -529,28 +561,6 @@ describe("api", function () {
529561
expect(reason instanceof UnknownErrorException).toEqual(true);
530562
expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R");
531563
}
532-
try {
533-
await pdfDocument3.getPage(1);
534-
535-
// Shouldn't get here.
536-
expect(false).toEqual(true);
537-
} catch (reason) {
538-
expect(reason instanceof UnknownErrorException).toEqual(true);
539-
expect(reason.message).toEqual(
540-
"Page dictionary kid reference points to wrong type of object."
541-
);
542-
}
543-
try {
544-
await pdfDocument4.getPage(1);
545-
546-
// Shouldn't get here.
547-
expect(false).toEqual(true);
548-
} catch (reason) {
549-
expect(reason instanceof UnknownErrorException).toEqual(true);
550-
expect(reason.message).toEqual(
551-
"Page dictionary kid reference points to wrong type of object."
552-
);
553-
}
554564

555565
await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
556566
});

0 commit comments

Comments
 (0)