From bb7fb993996191d9ae71a7f4cf490e85516e1dee Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Fri, 12 Dec 2025 18:14:28 +0100 Subject: [PATCH 1/2] :sparkles: add multi-receipt custom file saving formats --- package-lock.json | 40 ++++++++++--- package.json | 2 +- src/imageOperations/common/extractedImage.ts | 60 ++++++++++++++++++- src/input/sources/localInputSource.ts | 2 +- .../api/multiReceiptsReconstruction.spec.ts | 45 +++++++++++--- 5 files changed, 128 insertions(+), 21 deletions(-) diff --git a/package-lock.json b/package-lock.json index 161054514..853f9dad3 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,7 +13,7 @@ "commander": "~9.4.1", "file-type": "~16.5.4", "form-data": "~3.0.1", - "node-poppler": "^7.2.2", + "node-poppler": "^9.0.1", "pdf.js-extract": "^0.2.1", "sharp": "^0.33.5", "tmp": "^0.2.3", @@ -71,9 +71,9 @@ } }, "node_modules/@emnapi/runtime": { - "version": "1.5.0", - "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.5.0.tgz", - "integrity": "sha512-97/BJ3iXHww3djw6hYIfErCZFee7qCtrneuLa20UXFCOTCfBM2cvQHjWJ2EG0s0MtdNwInarqCTz35i4wWXHsQ==", + "version": "1.7.1", + "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.7.1.tgz", + "integrity": "sha512-PVtJr5CmLwYAU9PZDMITZoR5iAOShYREoR45EyyLrbntV50mdePTgUn4AmOw90Ifcj+x2kRjdzr1HP3RrNiHGA==", "license": "MIT", "optional": true, "dependencies": { @@ -3062,16 +3062,38 @@ } }, "node_modules/node-poppler": { - "version": "7.2.4", - "resolved": "https://registry.npmjs.org/node-poppler/-/node-poppler-7.2.4.tgz", - "integrity": "sha512-+YvPbEQ2uxsUVBXDIZiUEu0C3wMOo37D+cIqzWu5gZra3p6MHhBFJtZ/2slzj+5QPmWcHd9SAEjXk2ogBSHg9Q==", + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/node-poppler/-/node-poppler-9.0.1.tgz", + "integrity": "sha512-lcfk/3cnUABmwpH3pOrjV1zieb+Wyh9hbEeVWTFusWBbVJBuFOBZquR8CuN4xP74eeYljq97O1GXP6TaZI/D3Q==", "license": "MIT", "dependencies": { "camelcase": "^6.3.0", - "semver": "^7.6.3" + "semver": "^7.7.2" }, "engines": { - "node": ">=18" + "node": ">=20" + }, + "funding": { + "url": "https://github.com/sponsors/Fdawgs" + }, + "optionalDependencies": { + "node-poppler-win32": "^1.0.1" + } + }, + "node_modules/node-poppler-win32": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/node-poppler-win32/-/node-poppler-win32-1.0.2.tgz", + "integrity": "sha512-U2YVdM7EEXNE21TD4ajJxfd1pKx7yEF2tcuHc4eCa7uxmw+mfSNPB+yHRtNrdxJqV6dKoz4Rvhuyy5rn6mg3ZA==", + "cpu": [ + "x64" + ], + "license": "GPL-3.0-or-later", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=20" }, "funding": { "url": "https://github.com/sponsors/Fdawgs" diff --git a/package.json b/package.json index 903f1dd32..1a81b8aa7 100644 --- a/package.json +++ b/package.json @@ -60,7 +60,7 @@ "commander": "~9.4.1", "file-type": "~16.5.4", "form-data": "~3.0.1", - "node-poppler": "^7.2.2", + "node-poppler": "^9.0.1", "pdf.js-extract": "^0.2.1", "sharp": "^0.33.5", "tmp": "^0.2.3", diff --git a/src/imageOperations/common/extractedImage.ts b/src/imageOperations/common/extractedImage.ts index 4452d752b..175dde35c 100644 --- a/src/imageOperations/common/extractedImage.ts +++ b/src/imageOperations/common/extractedImage.ts @@ -4,6 +4,9 @@ import { writeFileSync } from "node:fs"; import path from "node:path"; import { logger } from "../../logger"; import { BufferInput } from "../../input"; +import { MIMETYPES } from "../../input/sources/localInputSource"; +import { Poppler } from "node-poppler"; +import { writeFile } from "fs/promises"; /** * Generic class for image extraction @@ -23,9 +26,35 @@ export class ExtractedImage { * * @param outputPath Path to save the file to. */ - saveToFile(outputPath: string) { + async saveToFileAsync(outputPath: string) { + const fileExt = path.extname(outputPath).toLowerCase(); + if (!MIMETYPES.has(fileExt)) { + throw new MindeeError(`Unsupported file extension: ${fileExt}`); + } + try { - writeFileSync(path.resolve(outputPath), this.buffer); + let outputBuffer: Buffer = this.buffer; + if (fileExt !== ".pdf") { + const poppler = new Poppler(); + const options: Record = { + firstPageToConvert: 1, + lastPageToConvert: 1, + singleFile: true, + }; + + if (fileExt === ".png") { + options.pngFile = true; + } else if (fileExt === ".jpg" || fileExt === ".jpeg") { + options.jpegFile = true; + } else if (fileExt === ".tiff" || fileExt === ".tif") { + options.tiffFile = true; + } + + const result = await poppler.pdfToCairo(this.buffer, undefined, options); + outputBuffer = Buffer.from(result, "latin1"); + } + + await writeFile(path.resolve(outputPath), outputBuffer); logger.info(`File saved successfully to ${path.resolve(outputPath)}.`); } catch (e) { if (e instanceof TypeError) { @@ -37,6 +66,33 @@ export class ExtractedImage { } + /** + * Attempts to saves the document to a file synchronously. + * Throws an error if the file extension is not supported or if the file could not be saved to disk for some reason. + * + * @param outputPath Path to save the file to. + */ + saveToFile(outputPath: string) { + const fileExt = path.extname(outputPath).toLowerCase(); + if (fileExt !== ".pdf") { + throw new MindeeError( + `Unsupported file extension: ${fileExt}. For image formats, use saveToFileAsync() instead.` + ); + } else { + try { + writeFileSync(path.resolve(outputPath), this.buffer); + logger.info(`File saved successfully to ${path.resolve(outputPath)}.`); + } catch (e) { + if (e instanceof TypeError) { + throw new MindeeError("Invalid path/filename provided."); + } else { + throw e; + } + } + } + } + + /** * Return the file as a Mindee-compatible BufferInput source. * diff --git a/src/input/sources/localInputSource.ts b/src/input/sources/localInputSource.ts index 40c8f00fd..203b6c39e 100644 --- a/src/input/sources/localInputSource.ts +++ b/src/input/sources/localInputSource.ts @@ -15,7 +15,7 @@ import { INPUT_TYPE_PATH, INPUT_TYPE_BUFFER } from "./inputSource"; -const MIMETYPES = new Map([ +export const MIMETYPES = new Map([ [".pdf", "application/pdf"], [".heic", "image/heic"], [".jpg", "image/jpeg"], diff --git a/tests/v1/api/multiReceiptsReconstruction.spec.ts b/tests/v1/api/multiReceiptsReconstruction.spec.ts index a6150cd95..418e828da 100644 --- a/tests/v1/api/multiReceiptsReconstruction.spec.ts +++ b/tests/v1/api/multiReceiptsReconstruction.spec.ts @@ -1,24 +1,30 @@ import { expect } from "chai"; import { promises as fs } from "fs"; import * as path from "path"; -import { Document } from "../../../src"; +import { Document, PathInput } from "../../../src"; import { MultiReceiptsDetectorV1 } from "../../../src/product"; import { extractReceipts } from "../../../src/imageOperations"; -import { PathInput } from "../../../src"; -import { V1_PRODUCT_PATH } from "../../index"; +import { RESOURCE_PATH, V1_PRODUCT_PATH } from "../../index"; describe("MindeeV1 - A Multi-Receipt Document", () => { - it("should be split into the proper receipts", async () => { + let extractedReceipts: any[]; + let sourceDoc: PathInput; + before(async () => { + const jsonData = await fs.readFile( path.join(V1_PRODUCT_PATH, "multi_receipts_detector/response_v1/complete.json") ); - const sourceDoc = new PathInput( - { inputPath: path.join(V1_PRODUCT_PATH, "multi_receipts_detector/default_sample.jpg") } - ); + + sourceDoc = new PathInput({ + inputPath: path.join(V1_PRODUCT_PATH, "multi_receipts_detector/default_sample.jpg"), + }); await sourceDoc.init(); + const response = JSON.parse(jsonData.toString()); const doc = new Document(MultiReceiptsDetectorV1, response.document); - const extractedReceipts = await extractReceipts(sourceDoc, doc.inference); + extractedReceipts = await extractReceipts(sourceDoc, doc.inference); + }); + it("should be split into the proper receipts", async () => { expect(extractedReceipts.length).to.be.equals(6); let i = 0; for (const extractedReceipt of extractedReceipts) { @@ -29,4 +35,27 @@ describe("MindeeV1 - A Multi-Receipt Document", () => { i++; } }); + + it("should be saved locally", async () => { + let i = 0; + for (const extractedReceipt of extractedReceipts) { + extractedReceipt.saveToFile(path.join(RESOURCE_PATH, `output/extracted_receipt_${i}.pdf`)); + await extractedReceipt.saveToFileAsync(path.join(RESOURCE_PATH, `output/extracted_receipt_${i}.png`)); + await extractedReceipt.saveToFileAsync(path.join(RESOURCE_PATH, `output/extracted_receipt_${i}.jpg`)); + const pdfStat = await fs.stat(path.join(RESOURCE_PATH, `output/extracted_receipt_${i}.pdf`)); + expect(pdfStat.size).to.be.greaterThan(500000); // Arbitrary to assert noticeable discrepancies between OSs. + const jpgStat = await fs.stat(path.join(RESOURCE_PATH, `output/extracted_receipt_${i}.jpg`)); + expect(jpgStat.size).to.be.greaterThan(40000); + const pngStat = await fs.stat(path.join(RESOURCE_PATH, `output/extracted_receipt_${i}.png`)); + expect(pngStat.size).to.be.greaterThan(300000); + i++; + } + }).timeout(10000); + after(async () => { + for (let i = 0; i < extractedReceipts.length; i++) { + await fs.unlink(path.join(RESOURCE_PATH, `output/extracted_receipt_${i}.pdf`)); + await fs.unlink(path.join(RESOURCE_PATH, `output/extracted_receipt_${i}.jpg`)); + await fs.unlink(path.join(RESOURCE_PATH, `output/extracted_receipt_${i}.png`)); + } + }); }); From f4a9968d11cc35843d07b1b280991f4a53fa06e7 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Mon, 15 Dec 2025 10:42:25 +0100 Subject: [PATCH 2/2] bump timeout value for local save test --- tests/v1/api/multiReceiptsReconstruction.spec.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/api/multiReceiptsReconstruction.spec.ts b/tests/v1/api/multiReceiptsReconstruction.spec.ts index 418e828da..db7350777 100644 --- a/tests/v1/api/multiReceiptsReconstruction.spec.ts +++ b/tests/v1/api/multiReceiptsReconstruction.spec.ts @@ -50,7 +50,7 @@ describe("MindeeV1 - A Multi-Receipt Document", () => { expect(pngStat.size).to.be.greaterThan(300000); i++; } - }).timeout(10000); + }).timeout(20000); after(async () => { for (let i = 0; i < extractedReceipts.length; i++) { await fs.unlink(path.join(RESOURCE_PATH, `output/extracted_receipt_${i}.pdf`));