diff --git a/src/clientV2.ts b/src/clientV2.ts
index b04f43b7e..9255b2191 100644
--- a/src/clientV2.ts
+++ b/src/clientV2.ts
@@ -1,4 +1,4 @@
-import { InputSource } from "./input";
+import { DataSchema, InputSource } from "./input";
 import { errorHandler } from "./errors/handler";
 import { LOG_LEVELS, logger } from "./logger";
 
@@ -6,6 +6,7 @@ import { setTimeout } from "node:timers/promises";
 import { ErrorResponse, InferenceResponse, JobResponse } from "./parsing/v2";
 import { MindeeApiV2 } from "./http/mindeeApiV2";
 import { MindeeHttpErrorV2 } from "./errors/mindeeError";
+import { StringDict } from "./parsing/common";
 
 /**
  * Parameters for the internal polling loop in {@link ClientV2.enqueueAndGetInference | enqueueAndGetInference()} .
@@ -102,6 +103,11 @@ export interface InferenceParameters {
   /** By default, the file is closed once the upload is finished.
    * Set to `false` to keep it open. */
   closeFile?: boolean;
+  /**
+   * Changes to apply to the data schema of the model, for this inference only.
+   * Not recommended, for specific use only.
+   */
+  dataSchema?: DataSchema | StringDict | string;
 }
 
 /**
@@ -152,6 +158,18 @@ export class ClientV2 {
     logger.debug("ClientV2 initialized");
   }
 
+  /**
+   * Converts a raw `dataSchema` (JSON string or plain object) into a {@link DataSchema}, in place.
+   * @param params Input Inference parameters.
+   */
+  validateDataSchema(params: InferenceParameters): void {
+    const schema = params.dataSchema;
+    if (schema === undefined || schema === null || schema instanceof DataSchema) {
+      return;
+    }
+    params.dataSchema = new DataSchema(schema);
+  }
+
   /**
    * Send the document to an asynchronous endpoint and return its ID in the queue.
    * @param inputSource file or URL to parse.
@@ -166,7 +184,9 @@ export class ClientV2 {
     if (inputSource === undefined) {
       throw new Error("The 'enqueue' function requires an input document.");
     }
+    this.validateDataSchema(params);
     await inputSource.init();
+
     return await this.mindeeApi.reqPostInferenceEnqueue(inputSource, params);
   }
 
diff --git a/src/http/mindeeApiV2.ts b/src/http/mindeeApiV2.ts
index d8a8148eb..b918b9f2d 100644
--- a/src/http/mindeeApiV2.ts
+++ b/src/http/mindeeApiV2.ts
@@ -113,6 +113,15 @@ export class MindeeApiV2 {
     if (params.textContext !== undefined && params.textContext !== null) {
       form.append("text_context", params.textContext);
     }
+    if (params.dataSchema !== undefined && params.dataSchema !== null) {
+      // A plain object's toString() is "[object Object]"; JSON.stringify() handles
+      // both plain objects and DataSchema instances (through their toJSON()).
+      const dataSchema = params.dataSchema;
+      form.append(
+        "data_schema",
+        typeof dataSchema === "string" ? dataSchema : JSON.stringify(dataSchema)
+      );
+    }
     if (params.webhookIds && params.webhookIds.length > 0) {
       form.append("webhook_ids", params.webhookIds.join(","));
     }
diff --git a/src/input/dataSchema.ts b/src/input/dataSchema.ts
new file mode 100644
index 000000000..7d033ebf7
--- /dev/null
+++ b/src/input/dataSchema.ts
@@ -0,0 +1,186 @@
+import { StringDict } from "../parsing/common";
+import { MindeeError } from "../errors";
+
+/**
+ * A single field definition of a data schema.
+ */
+export class DataSchemaField {
+  /**
+   * Display name for the field, also impacts inference results.
+   */
+  public title: string;
+
+  /**
+   * Name of the field in the data schema.
+   */
+  public name: string;
+
+  /**
+   * Whether this field can contain multiple values.
+   */
+  public isArray: boolean;
+
+  /**
+   * Data type of the field.
+   */
+  public type: string;
+
+  /**
+   * Allowed values when type is `classification`. Leave empty for other types.
+   */
+  public classificationValues?: Array<string>;
+
+  /**
+   * Whether to remove duplicate values in the array.
+   * Only applicable if `is_array` is True.
+   */
+  public uniqueValues?: boolean;
+
+  /**
+   * Detailed description of what this field represents.
+   */
+  public description?: string;
+
+  /**
+   * Optional extraction guidelines.
+   */
+  public guidelines?: string;
+
+  /**
+   * Subfields when type is `nested_object`. Leave empty for other types.
+   */
+  public nestedFields?: StringDict;
+
+  /**
+   * @param fields Field definition, read from snake_case keys (`is_array`, `nested_fields`, ...).
+   */
+  constructor(fields: StringDict) {
+    this.name = fields["name"];
+    this.title = fields["title"];
+    this.isArray = fields["is_array"];
+    this.type = fields["type"];
+    this.classificationValues = fields["classification_values"];
+    this.uniqueValues = fields["unique_values"];
+    this.description = fields["description"];
+    this.guidelines = fields["guidelines"];
+    this.nestedFields = fields["nested_fields"];
+  }
+
+  /**
+   * Converts the field back to its snake_case form, leaving out optional properties that are unset.
+   */
+  toJSON() {
+    const out: Record<string, unknown> = {
+      name: this.name,
+      title: this.title,
+      // eslint-disable-next-line @typescript-eslint/naming-convention,camelcase
+      is_array: this.isArray,
+      type: this.type,
+    };
+
+    // eslint-disable-next-line camelcase
+    if (this.classificationValues !== undefined) out.classification_values = this.classificationValues;
+    // eslint-disable-next-line camelcase
+    if (this.uniqueValues !== undefined) out.unique_values = this.uniqueValues;
+    if (this.description !== undefined) out.description = this.description;
+    if (this.guidelines !== undefined) out.guidelines = this.guidelines;
+    // eslint-disable-next-line camelcase
+    if (this.nestedFields !== undefined) out.nested_fields = this.nestedFields;
+
+    return out;
+  }
+
+  /**
+   * JSON string of the object returned by `toJSON()`.
+   */
+  toString() {
+    return JSON.stringify(this.toJSON());
+  }
+}
+
+/**
+ * The structure to completely replace the data schema of the model.
+ */
+export class DataSchemaReplace {
+  /**
+   * List of fields in the Data Schema.
+   */
+  fields: Array<DataSchemaField>;
+
+  /**
+   * @param dataSchemaReplace Object holding a non-empty `fields` array of field definitions.
+   * @throws {MindeeError} when the object is missing, or when `fields` is missing or not an array.
+   * @throws {TypeError} when `fields` is an empty array.
+   */
+  constructor(dataSchemaReplace: StringDict) {
+    // Array.isArray() also rejects truthy non-array values, which would otherwise only fail on `.map()`.
+    if (!dataSchemaReplace || !Array.isArray(dataSchemaReplace["fields"])) {
+      throw new MindeeError("Invalid Data Schema provided.");
+    }
+    if (dataSchemaReplace["fields"].length === 0) {
+      throw new TypeError("Data Schema replacement fields cannot be empty.");
+    }
+    this.fields = dataSchemaReplace["fields"].map((field: StringDict) => (new DataSchemaField(field)));
+  }
+
+  /**
+   * Converts the replacement to a plain object, with each field converted through its own `toJSON()`.
+   */
+  toJSON() {
+    return { fields: this.fields.map(e => e.toJSON()) };
+  }
+
+  /**
+   * JSON string of the object returned by `toJSON()`.
+   */
+  toString() {
+    return JSON.stringify(this.toJSON());
+  }
+}
+
+/**
+ * Modify the Data Schema.
+ */
+export class DataSchema {
+  /**
+   * If set, completely replaces the data schema of the model.
+   */
+  replace?: DataSchemaReplace;
+
+  /**
+   * @param dataSchema Either a JSON string or an object, holding the replacement under the `replace` key.
+   * @throws {MindeeError} when a string input is not valid JSON, or when `replace` is missing or invalid.
+   * @throws {TypeError} when the replacement holds an empty `fields` array.
+   */
+  constructor(dataSchema: StringDict | string) {
+    if (typeof dataSchema === "string") {
+      let parsed: StringDict | null;
+      try {
+        parsed = JSON.parse(dataSchema);
+      } catch {
+        // Report malformed JSON with the same error as any other invalid schema.
+        throw new MindeeError("Invalid Data Schema provided.");
+      }
+      // `parsed` can be `null` or a primitive: optional chaining lets DataSchemaReplace reject it.
+      this.replace = new DataSchemaReplace(parsed?.["replace"]);
+    } else if (dataSchema["replace"] instanceof DataSchemaReplace) {
+      this.replace = dataSchema["replace"];
+    } else {
+      this.replace = new DataSchemaReplace(dataSchema["replace"] as StringDict);
+    }
+  }
+
+  /**
+   * Converts the schema to a plain object, with the replacement under the `replace` key.
+   */
+  toJSON() {
+    return { replace: this.replace?.toJSON() };
+  }
+
+  /**
+   * JSON string of the object returned by `toJSON()`.
+   */
+  toString() {
+    return JSON.stringify(this.toJSON());
+  }
+}
diff --git a/src/input/index.ts b/src/input/index.ts
index 6b2d4ec38..3b8e1e66b 100644
--- a/src/input/index.ts
+++ b/src/input/index.ts
@@ -1,3 +1,4 @@
-export { PageOptions, PageOptionsOperation } from "./pageOptions";
+export { DataSchema, DataSchemaField, DataSchemaReplace } from "./dataSchema";
 export * from "./sources";
 export { LocalResponse } from "./localResponse";
+export { PageOptions, PageOptionsOperation } from "./pageOptions";
diff --git a/src/parsing/v2/dataSchemaActiveOption.ts b/src/parsing/v2/dataSchemaActiveOption.ts
new file mode 100644
index 000000000..ded792710
--- /dev/null
+++ b/src/parsing/v2/dataSchemaActiveOption.ts
@@ -0,0 +1,26 @@
+import { StringDict } from "../common";
+
+/**
+ * Data schema options activated during the inference.
+ */
+export class DataSchemaActiveOption {
+  /**
+   * Whether to replace the data schema.
+   */
+  replace: boolean;
+
+  /**
+   * @param serverResponse The `data_schema` active option; `replace` defaults to `false` when it is absent.
+   */
+  constructor(serverResponse: StringDict) {
+    // The argument is undefined when the response carries no `data_schema` key.
+    this.replace = serverResponse?.["replace"] ?? false;
+  }
+
+  /**
+   * Text summary of the option, under a "Data Schema" heading.
+   */
+  toString() {
+    return `Data Schema\n-----------\n:Replace: ${this.replace ? "True" : "False"}`;
+  }
+}
diff --git a/src/parsing/v2/inferenceActiveOptions.ts b/src/parsing/v2/inferenceActiveOptions.ts
index 34099b55c..5223dd021 100644
--- a/src/parsing/v2/inferenceActiveOptions.ts
+++ b/src/parsing/v2/inferenceActiveOptions.ts
@@ -1,4 +1,5 @@
 import { StringDict } from "../common";
+import { DataSchemaActiveOption } from "./dataSchemaActiveOption";
 
 export class InferenceActiveOptions {
   /**
@@ -23,12 +24,18 @@ export class InferenceActiveOptions {
    */
   public textContext: boolean;
 
+  /**
+   * Data schema options provided for the inference.
+   */
+  public dataSchema: DataSchemaActiveOption;
+
   constructor(serverResponse: StringDict) {
     this.rag = serverResponse["rag"];
     this.rawText = serverResponse["raw_text"];
     this.polygon = serverResponse["polygon"];
     this.confidence = serverResponse["confidence"];
     this.textContext = serverResponse["text_context"];
+    this.dataSchema = new DataSchemaActiveOption(serverResponse["data_schema"]);
   }
 
   toString(): string {
@@ -37,6 +44,8 @@ export class InferenceActiveOptions {
       `:Raw Text: ${this.rawText ? "True" : "False"}\n` +
       `:Polygon: ${this.polygon ? "True" : "False"}\n` +
       `:Confidence: ${this.confidence ? "True" : "False"}\n` +
-      `:RAG: ${this.rag ? "True" : "False"}\n`;
+      `:RAG: ${this.rag ? "True" : "False"}\n` +
+      `:Text Context: ${this.textContext ? "True" : "False"}\n\n` +
+      `${this.dataSchema}\n`;
   }
 }
diff --git a/tests/data b/tests/data
index 14b89c741..0c51e1d3e 160000
--- a/tests/data
+++ b/tests/data
@@ -1 +1 @@
-Subproject commit 14b89c741ed357f6a88c1ac0d203c7ef309e77ef
+Subproject commit 0c51e1d3e2258404c44280f25f4951ba6fe27324
diff --git a/tests/v2/clientV2.integration.ts b/tests/v2/clientV2.integration.ts
index 4cacae598..4583d90d8 100644
--- a/tests/v2/clientV2.integration.ts
+++ b/tests/v2/clientV2.integration.ts
@@ -13,7 +13,26 @@ import { Inference } from "../../src/parsing/v2";
 import { SimpleField } from "../../src/parsing/v2/field";
 import { MindeeHttpErrorV2 } from "../../src/errors/mindeeError";
 import * as fs from "node:fs";
-import { RESOURCE_PATH, V2_PRODUCT_PATH } from "../index";
+import { RESOURCE_PATH, V2_PRODUCT_PATH, V2_RESOURCE_PATH } from "../index";
+
+function check422(err: unknown) {
+  expect(err).to.be.instanceOf(MindeeHttpErrorV2);
+  const errObj = err as MindeeHttpErrorV2;
+  expect(errObj.status).to.equal(422);
+  expect(errObj.code.startsWith("422-")).to.be.true;
+  expect(errObj.title).to.be.a("string");
+  expect(errObj.detail).to.be.a("string");
+  expect(errObj.errors).to.be.instanceOf(Array);
+}
+
+function checkEmptyActiveOptions(inference: Inference) {
+  expect(inference.activeOptions).to.not.be.null;
+  expect(inference.activeOptions?.rag).to.be.false;
+  expect(inference.activeOptions?.rawText).to.be.false;
+  expect(inference.activeOptions?.polygon).to.be.false;
+  expect(inference.activeOptions?.confidence).to.be.false;
+  expect(inference.activeOptions?.textContext).to.be.false;
+}
 
 describe("MindeeV2 – Client Integration Tests", () => {
   let client: ClientV2;
@@ -35,6 +54,10 @@ describe("MindeeV2 – Client Integration Tests", () => {
     "file_types",
     "receipt.txt",
   );
+  const dataSchemaReplacePath = path.join(
+    V2_RESOURCE_PATH, "inference/data_schema_replace_param.json"
+  );
+  let dataSchemaReplace: string;
 
   beforeEach(async () => {
     const apiKey = process.env["MINDEE_V2_API_KEY"] ?? "";
@@ -42,6 +65,9 @@ describe("MindeeV2 – Client Integration Tests", () => {
 
     client = new ClientV2({ apiKey });
   });
+  before(async () => {
+    dataSchemaReplace = fs.readFileSync(dataSchemaReplacePath).toString();
+  });
 
   it("Empty, multi-page PDF – PathInput - enqueueAndGetInference must succeed", async () => {
     const source = new PathInput({ inputPath: emptyPdfPath });
@@ -67,12 +93,7 @@ describe("MindeeV2 – Client Integration Tests", () => {
     expect(inference.result).to.exist;
     expect(inference.result.rawText).to.be.undefined;
 
-    expect(inference.activeOptions).to.not.be.null;
-    expect(inference.activeOptions?.rag).to.be.false;
-    expect(inference.activeOptions?.rawText).to.be.false;
-    expect(inference.activeOptions?.polygon).to.be.false;
-    expect(inference.activeOptions?.confidence).to.be.false;
-    expect(inference.activeOptions?.textContext).to.be.false;
+    checkEmptyActiveOptions(inference);
   }).timeout(60000);
 
   it("Filled, single-page image – PathInput - enqueueAndGetInference must succeed", async () => {
@@ -140,12 +161,7 @@ describe("MindeeV2 – Client Integration Tests", () => {
     expect(supplierField).to.be.instanceOf(SimpleField);
     expect(supplierField.value).to.equal("Clachan");
 
-    expect(inference.activeOptions).to.not.be.null;
-    expect(inference.activeOptions?.rag).to.be.false;
-    expect(inference.activeOptions?.rawText).to.be.false;
-    expect(inference.activeOptions?.polygon).to.be.false;
-    expect(inference.activeOptions?.confidence).to.be.false;
-    expect(inference.activeOptions?.textContext).to.be.false;
+    checkEmptyActiveOptions(inference);
   }).timeout(120000);
 
   it("Invalid model ID – enqueue must raise 422", async () => {
@@ -156,13 +172,7 @@ describe("MindeeV2 – Client Integration Tests", () => {
       await client.enqueueInference(source, badParams);
       expect.fail("Expected the call to throw, but it succeeded.");
     } catch (err) {
-      expect(err).to.be.instanceOf(MindeeHttpErrorV2);
-      const errObj = err as MindeeHttpErrorV2;
-      expect(errObj.status).to.equal(422);
-      expect(errObj.code.startsWith("422-")).to.be.true;
-      expect(errObj.title).to.be.a("string");
-      expect(errObj.detail).to.be.a("string");
-      expect(errObj.errors).to.be.instanceOf(Array);
+      check422(err);
     }
   }).timeout(60000);
 
@@ -171,13 +181,7 @@ describe("MindeeV2 – Client Integration Tests", () => {
       await client.getInference("00000000-0000-0000-0000-000000000000");
       expect.fail("Expected the call to throw, but it succeeded.");
     } catch (err) {
-      expect(err).to.be.instanceOf(MindeeHttpErrorV2);
-      const errObj = err as MindeeHttpErrorV2;
-      expect(errObj.status).to.equal(422);
-      expect(errObj.code.startsWith("422-")).to.be.true;
-      expect(errObj.title).to.be.a("string");
-      expect(errObj.detail).to.be.a("string");
-      expect(errObj.errors).to.be.instanceOf(Array);
+      check422(err);
     }
   }).timeout(60000);
 
@@ -200,4 +204,25 @@ describe("MindeeV2 – Client Integration Tests", () => {
     expect(response.inference).to.be.instanceOf(Inference);
   }).timeout(60000);
 
+  it("Data Schema Override - Overrides the data schema successfully", async () => {
+    const source = new PathInput({ inputPath: emptyPdfPath });
+    const params: InferenceParameters = {
+      modelId,
+      rag: false,
+      rawText: false,
+      confidence: false,
+      polygon: false,
+      webhookIds: [],
+      dataSchema: dataSchemaReplace,
+      alias: "ts_integration_data_schema_replace"
+    };
+    const response = await client.enqueueAndGetInference(source, params);
+
+    expect(response).to.exist;
+    expect(response.inference).to.be.instanceOf(Inference);
+    expect(response.inference.result.fields.get("test_replace")).to.exist;
+    expect((response.inference.result.fields.get("test_replace") as SimpleField).value).to.be.equals("a test value");
+
+  }).timeout(60000);
+
 });
diff --git a/tests/v2/input/inferenceParameter.spec.ts b/tests/v2/input/inferenceParameter.spec.ts
new file mode 100644
index 000000000..cad3056c7
--- /dev/null
+++ b/tests/v2/input/inferenceParameter.spec.ts
@@ -0,0 +1,48 @@
+import { StringDict } from "../../../src/parsing/common";
+import path from "path";
+import { V2_RESOURCE_PATH } from "../../index";
+import { InferenceParameters } from "../../../src";
+import { expect } from "chai";
+import { DataSchema } from "../../../src/input";
+import { promises as fs } from "fs";
+
+let expectedDataSchemaDict: StringDict;
+let expectedDataSchemaString: string;
+let expectedDataSchemaObject: DataSchema;
+
+describe("MindeeV2 - Inference Parameter", () => {
+  before(async () => {
+    const fileContents = await fs.readFile(path.join(V2_RESOURCE_PATH, "inference/data_schema_replace_param.json"));
+    expectedDataSchemaDict = JSON.parse(fileContents.toString());
+    expectedDataSchemaString = JSON.stringify(expectedDataSchemaDict);
+    expectedDataSchemaObject = new DataSchema(expectedDataSchemaDict);
+  });
+
+  it("shouldn't replace when unset", async () => {
+    const params: InferenceParameters = {
+      modelId: "test-model-id",
+    };
+
+    expect(params.dataSchema).to.be.undefined;
+  });
+
+  it("should equate no matter the type", async () => {
+    const paramsDict: InferenceParameters = {
+      modelId: "test-model-id",
+      dataSchema: expectedDataSchemaDict,
+    };
+    const paramsString: InferenceParameters = {
+      modelId: "test-model-id",
+      dataSchema: expectedDataSchemaString,
+    };
+    const paramsObject: InferenceParameters = {
+      modelId: "test-model-id",
+      dataSchema: expectedDataSchemaObject,
+    };
+
+    expect(JSON.stringify(paramsDict.dataSchema)).to.eq(expectedDataSchemaString);
+    expect(paramsObject.dataSchema?.toString()).to.eq(expectedDataSchemaString);
+    expect(paramsString.dataSchema?.toString()).to.eq(expectedDataSchemaString);
+  });
+
+});
diff --git a/tests/v2/input/localResponse.spec.ts b/tests/v2/input/localResponse.spec.ts
index cee9495e1..89eae6d14 100644
--- a/tests/v2/input/localResponse.spec.ts
+++ b/tests/v2/input/localResponse.spec.ts
@@ -6,7 +6,7 @@ import path from "path";
 import { V2_RESOURCE_PATH } from "../../index";
 import { Buffer } from "node:buffer";
 
-const signature: string = "b82a515c832fd2c4f4ce3a7e6f53c12e8d10e19223f6cf0e3a9809a7a3da26be";
+const signature: string = "1df388c992d87897fe61dfc56c444c58fc3c7369c31e2b5fd20d867695e93e85";
 const dummySecretKey: string = "ogNjY44MhvKPGTtVsI8zG82JqWQa68woYQH";
 const filePath: string = path.join(V2_RESOURCE_PATH, "inference/standard_field_types.json");
 