diff --git a/src/imageOperations/invoiceSplitterExtractor/invoiceSplitterExtractor.ts b/src/imageOperations/invoiceSplitterExtractor/invoiceSplitterExtractor.ts index fd8960314..0cc0dd191 100644 --- a/src/imageOperations/invoiceSplitterExtractor/invoiceSplitterExtractor.ts +++ b/src/imageOperations/invoiceSplitterExtractor/invoiceSplitterExtractor.ts @@ -3,6 +3,7 @@ import { MindeeError, MindeeMimeTypeError } from "../../errors"; import { InvoiceSplitterV1 } from "../../product"; import { LocalInputSource } from "../../input"; import { ExtractedInvoiceSplitterImage } from "./extractedInvoiceSplitterImage"; +import { loadPdfWithFallback } from "../../pdf/pdfOperation"; async function splitPdf(pdfDoc: PDFDocument, invoicePageGroups: number[][]): Promise { if (invoicePageGroups.length === 0) { @@ -35,7 +36,7 @@ async function getPdfDoc(inputFile: LocalInputSource): Promise { throw new MindeeMimeTypeError("Invoice Splitter is only compatible with pdf documents."); } - const pdfDoc = await PDFDocument.load(inputFile.fileObject); + const pdfDoc = await loadPdfWithFallback(inputFile.fileObject); if (pdfDoc.getPageCount() < 2) { throw new MindeeError("Invoice Splitter is only compatible with multi-page-pdf documents."); } diff --git a/src/imageOperations/multiReceiptsExtractor/multiReceiptsExtractor.ts b/src/imageOperations/multiReceiptsExtractor/multiReceiptsExtractor.ts index 5559d28c2..b46787a58 100644 --- a/src/imageOperations/multiReceiptsExtractor/multiReceiptsExtractor.ts +++ b/src/imageOperations/multiReceiptsExtractor/multiReceiptsExtractor.ts @@ -6,6 +6,7 @@ import { ExtractedMultiReceiptImage } from "./extractedMultiReceiptImage"; import { LocalInputSource } from "../../input"; import { extractFromPage } from "../common"; import { PositionField } from "../../parsing/standard"; +import { loadPdfWithFallback } from "../../pdf/pdfOperation"; /** * Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptImage @@ -37,7 +38,7 @@ async function loadPdfDoc(inputFile: LocalInputSource) { '" Currently supported types are .png, .jpg and .pdf' ); } else if (inputFile.isPdf()) { - pdfDoc = await PDFDocument.load(inputFile.fileObject); + pdfDoc = await loadPdfWithFallback(inputFile.fileObject); } else { pdfDoc = await PDFDocument.create(); let image: PDFImage; diff --git a/src/pdf/pdfCompressor.ts b/src/pdf/pdfCompressor.ts index 6a26972f1..b1c40c307 100644 --- a/src/pdf/pdfCompressor.ts +++ b/src/pdf/pdfCompressor.ts @@ -5,6 +5,7 @@ import * as fs from "node:fs"; import { Poppler } from "node-poppler"; import { PDFDocument, PDFFont, PDFPage, rgb, StandardFonts } from "pdf-lib"; import { compressImage } from "../imageOperations"; +import { loadPdfWithFallback } from "./pdfOperation"; /** * Compresses each page of a provided PDF buffer. @@ -128,7 +129,7 @@ async function compressPagesWithQuality( disableSourceText: boolean, extractedText: ExtractedPdfInfo | null ): Promise { - const pdfDoc = await PDFDocument.load(pdfData); + const pdfDoc = await loadPdfWithFallback(pdfData); const compressedPages: Buffer[] = []; for (let i = 0; i < extractedPdfInfo.pages.length; i++) { diff --git a/src/pdf/pdfOperation.ts b/src/pdf/pdfOperation.ts index 878d1c766..c6aead96a 100644 --- a/src/pdf/pdfOperation.ts +++ b/src/pdf/pdfOperation.ts @@ -3,12 +3,44 @@ import { PDFDocument } from "pdf-lib"; import { PageOptions, PageOptionsOperation } from "../input"; import { MindeeError } from "../errors"; import { logger } from "../logger"; +import { Poppler } from "node-poppler"; +import { readFile } from "fs/promises"; +import tmp from "tmp"; +import fs from "node:fs"; export interface SplitPdf { file: Buffer; totalPagesRemoved: number; } +/** + * Attempts to load the file using pdf-lib, and falls back to node-poppler if unable to. + * @param file File buffer to be opened. + */ +export async function loadPdfWithFallback(file: string | Buffer) { + const document = await PDFDocument.load(file, { ignoreEncryption: true }); + if (!document.isEncrypted) { + return document; + } + const poppler = new Poppler(); + + const tmpPdfOutput = tmp.fileSync(); + const tmpPdfOutputPath = tmpPdfOutput.name; + + try { + await poppler.pdfToCairo(file, tmpPdfOutputPath, { + pdfFile: true, + antialias: "default", + }); + + const convertedPdf = await readFile(tmpPdfOutputPath); + return await PDFDocument.load(convertedPdf, { ignoreEncryption: true }); + } finally { + await fs.promises.unlink(tmpPdfOutputPath); + } +} + + /** * Cut pages from a pdf file. If pages index are out of bound, it will throw an error. * @param file @@ -19,9 +51,7 @@ export async function extractPages( file: Buffer, pageOptions: PageOptions ): Promise { - const currentPdf = await PDFDocument.load(file, { - ignoreEncryption: true, - }); + const currentPdf = await loadPdfWithFallback(file); const newPdf = await PDFDocument.create(); @@ -84,8 +114,6 @@ export async function extractPages( } export async function countPages(file: Buffer): Promise { - const currentPdf = await PDFDocument.load(file, { - ignoreEncryption: true, - }); + const currentPdf = await loadPdfWithFallback(file); return currentPdf.getPageCount(); } diff --git a/tests/data b/tests/data index 0c9cfe341..03c490671 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 0c9cfe3416babad79b0689b40eb71917ab996beb +Subproject commit 03c4906716cb29ab305da09f149c93ea322e8dd6