From 5b1c23c436ab7b383ca78ff9bcdc8f14452b3192 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Wed, 16 Apr 2025 11:13:49 +0200 Subject: [PATCH 1/2] :recycle: add fallback poppler mechanism to prevent default-encrypted XFA documents from getting bounced by pdf-lib --- .../invoiceSplitterExtractor.ts | 3 +- .../multiReceiptsExtractor.ts | 3 +- src/pdf/pdfCompressor.ts | 3 +- src/pdf/pdfOperation.ts | 43 ++++++++++++++++--- 4 files changed, 43 insertions(+), 9 deletions(-) diff --git a/src/imageOperations/invoiceSplitterExtractor/invoiceSplitterExtractor.ts b/src/imageOperations/invoiceSplitterExtractor/invoiceSplitterExtractor.ts index fd8960314..0cc0dd191 100644 --- a/src/imageOperations/invoiceSplitterExtractor/invoiceSplitterExtractor.ts +++ b/src/imageOperations/invoiceSplitterExtractor/invoiceSplitterExtractor.ts @@ -3,6 +3,7 @@ import { MindeeError, MindeeMimeTypeError } from "../../errors"; import { InvoiceSplitterV1 } from "../../product"; import { LocalInputSource } from "../../input"; import { ExtractedInvoiceSplitterImage } from "./extractedInvoiceSplitterImage"; +import { loadPdfWithFallback } from "../../pdf/pdfOperation"; async function splitPdf(pdfDoc: PDFDocument, invoicePageGroups: number[][]): Promise { if (invoicePageGroups.length === 0) { @@ -35,7 +36,7 @@ async function getPdfDoc(inputFile: LocalInputSource): Promise { throw new MindeeMimeTypeError("Invoice Splitter is only compatible with pdf documents."); } - const pdfDoc = await PDFDocument.load(inputFile.fileObject); + const pdfDoc = await loadPdfWithFallback(inputFile.fileObject); if (pdfDoc.getPageCount() < 2) { throw new MindeeError("Invoice Splitter is only compatible with multi-page-pdf documents."); } diff --git a/src/imageOperations/multiReceiptsExtractor/multiReceiptsExtractor.ts b/src/imageOperations/multiReceiptsExtractor/multiReceiptsExtractor.ts index 5559d28c2..b46787a58 100644 --- a/src/imageOperations/multiReceiptsExtractor/multiReceiptsExtractor.ts +++ b/src/imageOperations/multiReceiptsExtractor/multiReceiptsExtractor.ts @@ -6,6 +6,7 @@ import { ExtractedMultiReceiptImage } from "./extractedMultiReceiptImage"; import { LocalInputSource } from "../../input"; import { extractFromPage } from "../common"; import { PositionField } from "../../parsing/standard"; +import { loadPdfWithFallback } from "../../pdf/pdfOperation"; /** * Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptImage @@ -37,7 +38,7 @@ async function loadPdfDoc(inputFile: LocalInputSource) { '" Currently supported types are .png, .jpg and .pdf' ); } else if (inputFile.isPdf()) { - pdfDoc = await PDFDocument.load(inputFile.fileObject); + pdfDoc = await loadPdfWithFallback(inputFile.fileObject); } else { pdfDoc = await PDFDocument.create(); let image: PDFImage; diff --git a/src/pdf/pdfCompressor.ts b/src/pdf/pdfCompressor.ts index 6a26972f1..b1c40c307 100644 --- a/src/pdf/pdfCompressor.ts +++ b/src/pdf/pdfCompressor.ts @@ -5,6 +5,7 @@ import * as fs from "node:fs"; import { Poppler } from "node-poppler"; import { PDFDocument, PDFFont, PDFPage, rgb, StandardFonts } from "pdf-lib"; import { compressImage } from "../imageOperations"; +import { loadPdfWithFallback } from "./pdfOperation"; /** * Compresses each page of a provided PDF buffer. @@ -128,7 +129,7 @@ async function compressPagesWithQuality( disableSourceText: boolean, extractedText: ExtractedPdfInfo | null ): Promise { - const pdfDoc = await PDFDocument.load(pdfData); + const pdfDoc = await loadPdfWithFallback(pdfData); const compressedPages: Buffer[] = []; for (let i = 0; i < extractedPdfInfo.pages.length; i++) { diff --git a/src/pdf/pdfOperation.ts b/src/pdf/pdfOperation.ts index 878d1c766..2c93c080b 100644 --- a/src/pdf/pdfOperation.ts +++ b/src/pdf/pdfOperation.ts @@ -3,12 +3,47 @@ import { PDFDocument } from "pdf-lib"; import { PageOptions, PageOptionsOperation } from "../input"; import { MindeeError } from "../errors"; import { logger } from "../logger"; +import { Poppler } from "node-poppler"; +import { readFile, writeFile } from "fs/promises"; +import tmp from "tmp"; +import fs from "node:fs"; export interface SplitPdf { file: Buffer; totalPagesRemoved: number; } +/** + * Attempts to load the file using pdf-lib, and falls back to node-poppler if unable to. + * @param file File buffer to be opened. + */ +export async function loadPdfWithFallback(file: string | Buffer) { + const document = await PDFDocument.load(file, { ignoreEncryption: true }); + if (!document.isEncrypted) { + return document; + } + const poppler = new Poppler(); + + const tmpPdfInput = tmp.fileSync(); + const tmpPdfInputPath = tmpPdfInput.name; + const tmpPdfOutput = tmp.fileSync(); + const tmpPdfOutputPath = tmpPdfOutput.name; + + try { + await writeFile(tmpPdfInputPath, file); + await poppler.pdfToCairo(tmpPdfInputPath, tmpPdfOutputPath, { + pdfFile: true, + antialias: "default", + }); + + const convertedPdf = await readFile(tmpPdfOutputPath); + return await PDFDocument.load(convertedPdf, { ignoreEncryption: true }); + } finally { + await fs.promises.unlink(tmpPdfInputPath); + await fs.promises.unlink(tmpPdfOutputPath);} +} + + /** * Cut pages from a pdf file. If pages index are out of bound, it will throw an error. * @param file @@ -19,9 +54,7 @@ export async function extractPages( file: Buffer, pageOptions: PageOptions ): Promise { - const currentPdf = await PDFDocument.load(file, { - ignoreEncryption: true, - }); + const currentPdf = await loadPdfWithFallback(file); const newPdf = await PDFDocument.create(); @@ -84,8 +117,6 @@ export async function extractPages( } export async function countPages(file: Buffer): Promise { - const currentPdf = await PDFDocument.load(file, { - ignoreEncryption: true, - }); + const currentPdf = await loadPdfWithFallback(file); return currentPdf.getPageCount(); } From 150c4248c82bb6c24373e62639918fa3a91998e9 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Wed, 16 Apr 2025 14:55:24 +0200 Subject: [PATCH 2/2] remove needless file operation --- src/pdf/pdfOperation.ts | 11 ++++------- tests/data | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/pdf/pdfOperation.ts b/src/pdf/pdfOperation.ts index 2c93c080b..c6aead96a 100644 --- a/src/pdf/pdfOperation.ts +++ b/src/pdf/pdfOperation.ts @@ -4,7 +4,7 @@ import { PageOptions, PageOptionsOperation } from "../input"; import { MindeeError } from "../errors"; import { logger } from "../logger"; import { Poppler } from "node-poppler"; -import { readFile, writeFile } from "fs/promises"; +import { readFile } from "fs/promises"; import tmp from "tmp"; import fs from "node:fs"; @@ -24,14 +24,11 @@ export async function loadPdfWithFallback(file: string | Buffer) { } const poppler = new Poppler(); - const tmpPdfInput = tmp.fileSync(); - const tmpPdfInputPath = tmpPdfInput.name; const tmpPdfOutput = tmp.fileSync(); const tmpPdfOutputPath = tmpPdfOutput.name; try { - await writeFile(tmpPdfInputPath, file); - await poppler.pdfToCairo(tmpPdfInputPath, tmpPdfOutputPath, { + await poppler.pdfToCairo(file, tmpPdfOutputPath, { pdfFile: true, antialias: "default", }); @@ -39,8 +36,8 @@ export async function loadPdfWithFallback(file: string | Buffer) { const convertedPdf = await readFile(tmpPdfOutputPath); return await PDFDocument.load(convertedPdf, { ignoreEncryption: true }); } finally { - await fs.promises.unlink(tmpPdfInputPath); - await fs.promises.unlink(tmpPdfOutputPath);} + await fs.promises.unlink(tmpPdfOutputPath); + } } diff --git a/tests/data b/tests/data index 0c9cfe341..03c490671 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 0c9cfe3416babad79b0689b40eb71917ab996beb +Subproject commit 03c4906716cb29ab305da09f149c93ea322e8dd6