diff --git a/docs/code_samples/invoice_splitter_v1_async.txt b/docs/code_samples/invoice_splitter_v1_async.txt index ea9336787..05f914da3 100644 --- a/docs/code_samples/invoice_splitter_v1_async.txt +++ b/docs/code_samples/invoice_splitter_v1_async.txt @@ -1,12 +1,9 @@ import com.mindee.MindeeClient; import com.mindee.input.LocalInputSource; import com.mindee.parsing.common.AsyncPredictResponse; -import com.mindee.parsing.common.Job; -import com.mindee.parsing.common.Document; import com.mindee.product.invoicesplitter.InvoiceSplitterV1; import java.io.File; import java.io.IOException; -import java.util.Optional; public class SimpleMindeeClient { @@ -40,4 +37,5 @@ public class SimpleMindeeClient { // page -> System.out.println(page.toString()) // ); } + } diff --git a/docs/financial_document_v1.md b/docs/financial_document_v1.md index a1aa024ea..25d098424 100644 --- a/docs/financial_document_v1.md +++ b/docs/financial_document_v1.md @@ -107,7 +107,7 @@ public class SimpleMindeeClient { ######## Document ######## -:Mindee ID: b26161ce-35d0-4984-b1ff-886645e160e6 +:Mindee ID: f469a24d-3875-4a83-ad43-e0d5aa9da604 :Filename: default_sample.jpg Inference @@ -124,8 +124,8 @@ Prediction :Document Number: INT-001 :Reference Numbers: 2412/2019 :Purchase Date: 2019-11-02 -:Due Date: 2019-02-26 -:Payment Date: 2019-02-26 +:Due Date: 2019-11-17 +:Payment Date: 2019-11-17 :Total Net: 195.00 :Total Amount: 204.75 :Taxes: @@ -176,8 +176,8 @@ Page 0 :Document Number: INT-001 :Reference Numbers: 2412/2019 :Purchase Date: 2019-11-02 -:Due Date: 2019-02-26 -:Payment Date: 2019-02-26 +:Due Date: 2019-11-17 +:Payment Date: 2019-11-17 :Total Net: 195.00 :Total Amount: 204.75 :Taxes: diff --git a/docs/invoice_splitter_v1.md b/docs/invoice_splitter_v1.md index 622c7f6fa..55adb2c1a 100644 --- a/docs/invoice_splitter_v1.md +++ b/docs/invoice_splitter_v1.md @@ -1,27 +1,22 @@ --- -title: Invoice Splitter API Java +title: Invoice Splitter OCR Java category: 622b805aaec68102ea7fcbc2 slug: 
java-invoice-splitter-ocr parentDoc: 631a062c3718850f3519b793 --- The Java OCR SDK supports the [Invoice Splitter API](https://platform.mindee.com/mindee/invoice_splitter). -Using [this sample](https://github.com/mindee/client-lib-test-data/blob/main/products/invoice_splitter/default_sample.pdf), we are going to illustrate how to detect the pages of multiple invoices within the same document. +Using the [sample below](https://github.com/mindee/client-lib-test-data/blob/main/products/invoice_splitter/default_sample.pdf), we are going to illustrate how to extract the data that we want using the OCR SDK. +![Invoice Splitter sample](https://github.com/mindee/client-lib-test-data/blob/main/products/invoice_splitter/default_sample.pdf?raw=true) # Quick-Start - -> **⚠️ Important:** This API only works **asynchronously**, which means that documents have to be sent and retrieved in a specific way: - ```java import com.mindee.MindeeClient; import com.mindee.input.LocalInputSource; import com.mindee.parsing.common.AsyncPredictResponse; -import com.mindee.parsing.common.Job; -import com.mindee.parsing.common.Document; import com.mindee.product.invoicesplitter.InvoiceSplitterV1; import java.io.File; import java.io.IOException; -import java.util.Optional; public class SimpleMindeeClient { @@ -37,8 +32,8 @@ public class SimpleMindeeClient { // Parse the file asynchronously AsyncPredictResponse response = mindeeClient.enqueueAndParse( - InvoiceSplitterV1.class, - inputSource + InvoiceSplitterV1.class, + inputSource ); // Print a summary of the response @@ -55,69 +50,82 @@ public class SimpleMindeeClient { // page -> System.out.println(page.toString()) // ); } + } + ``` **Output (RST):** - ```rst ######## Document ######## -:Mindee ID: 8c25cc63-212b-4537-9c9b-3fbd3bd0ee20 -:Filename: default_sample.jpg +:Mindee ID: 15ad7a19-7b75-43d0-b0c6-9a641a12b49b +:Filename: default_sample.pdf Inference ######### -:Product: mindee/carte_vitale v1.0 -:Rotation applied: Yes +:Product: 
mindee/invoice_splitter v1.1 +:Rotation applied: No Prediction ========== -:Given Name(s): NATHALIE -:Surname: DURAND -:Social Security Number: 269054958815780 -:Issuance Date: 2007-01-01 +:Invoice Page Groups: + :Page indexes: 0 + :Page indexes: 1 Page Predictions ================ Page 0 ------ -:Given Name(s): NATHALIE -:Surname: DURAND -:Social Security Number: 269054958815780 -:Issuance Date: 2007-01-01 +:Invoice Page Groups: + +Page 1 +------ +:Invoice Page Groups: ``` # Field Types +## Standard Fields +These fields are generic and used in several products. -## Specific Fields +### BaseField +Each prediction object contains a set of fields that inherit from the generic `BaseField` class. +A typical `BaseField` object will have the following attributes: -### Page Indexes +* **confidence** (`Double`): the confidence score of the field prediction. +* **boundingBox** (`Polygon`): contains exactly 4 relative vertices (points) coordinates of a right rectangle containing the field in the document. +* **polygon** (`Polygon`): contains the relative vertices coordinates (`polygon` extends `List`) of a polygon containing the field in the image. +* **pageId** (`Integer`): the ID of the page, always `null` when at document-level. -List of page group indexes. +> **Note:** A `Point` simply refers to a List of `Double`. -A `PageIndexes` implements the following attributes: -- **pageIndexes** (`List`): List of indexes of the pages of a single invoice. -- **confidence** (`Double`): The confidence of the prediction. +Aside from the previous attributes, all basic fields have access to a custom `toString` method that can be used to print their value as a string. -# Attributes +## Specific Fields +Fields which are specific to this product; they are not used in any other product. + +### Invoice Page Groups Field +List of page groups. Each group represents a single invoice within a multi-invoice document. 
+ +An `InvoiceSplitterV1InvoicePageGroup` implements the following attributes: +* **pageIndexes** (`List`): List of page indexes that belong to the same invoice (group). + +# Attributes The following fields are extracted for Invoice Splitter V1: ## Invoice Page Groups - -**invoicePageGroups** (`List<`[invoicePageGroups](#page-indexes)`>`): List of page indexes that belong to the same invoice in the PDF. +**invoicePageGroups** (`List<`[InvoiceSplitterV1InvoicePageGroup](#invoice-page-groups-field)`>`): List of page groups. Each group represents a single invoice within a multi-invoice document. ```java for (invoicePageGroupsElem : result.getDocument().getInference().getPrediction().getInvoicePageGroups()) { - System.out.println(invoicePageGroupsElem); + System.out.println(invoicePageGroupsElem.value); } ``` # Questions? - [Join our Slack](https://join.slack.com/t/mindee-community/shared_invite/zt-2d0ds7dtz-DPAF81ZqTy20chsYpQBW5g) diff --git a/docs/invoices_v4.md b/docs/invoices_v4.md index 28b20c593..b8b9b09d6 100644 --- a/docs/invoices_v4.md +++ b/docs/invoices_v4.md @@ -107,7 +107,7 @@ public class SimpleMindeeClient { ######## Document ######## -:Mindee ID: a67b70ea-4b1e-4eac-ae75-dda47a7064ae +:Mindee ID: 86b1833f-138b-4a01-8387-860204b0e631 :Filename: default_sample.jpg Inference @@ -122,8 +122,8 @@ Prediction :Purchase Order Number: AD29094 :Reference Numbers: AD29094 :Purchase Date: 2018-09-25 -:Due Date: 2011-12-01 -:Payment Date: 2011-12-01 +:Due Date: +:Payment Date: :Total Net: 2145.00 :Total Amount: 2608.20 :Total Tax: 193.20 @@ -168,8 +168,8 @@ Page 0 :Purchase Order Number: AD29094 :Reference Numbers: AD29094 :Purchase Date: 2018-09-25 -:Due Date: 2011-12-01 -:Payment Date: 2011-12-01 +:Due Date: +:Payment Date: :Total Net: 2145.00 :Total Amount: 2608.20 :Total Tax: 193.20 diff --git a/src/main/java/com/mindee/extraction/PDFExtractor.java b/src/main/java/com/mindee/extraction/PDFExtractor.java index 46cec13d6..c1a494e40 100644 --- 
a/src/main/java/com/mindee/extraction/PDFExtractor.java +++ b/src/main/java/com/mindee/extraction/PDFExtractor.java @@ -5,7 +5,7 @@ import com.mindee.MindeeException; import com.mindee.input.InputSourceUtils; import com.mindee.input.LocalInputSource; -import com.mindee.product.invoicesplitter.InvoiceSplitterV1Document; +import com.mindee.product.invoicesplitter.InvoiceSplitterV1InvoicePageGroup; import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -118,11 +118,11 @@ public List extractSubDocuments(List> pageIndexes) * @return a list of extracted files. * @throws IOException Throws if the file can't be accessed. */ - public List extractInvoices(List pageIndexes) + public List extractInvoices(List pageIndexes) throws IOException { List> indexes = - pageIndexes.stream().map(InvoiceSplitterV1Document.PageIndexes::getPageIndexes) + pageIndexes.stream().map(InvoiceSplitterV1InvoicePageGroup::getPageIndexes) .collect(Collectors.toList()); @@ -138,17 +138,17 @@ public List extractInvoices(List extractInvoices(List pageIndexes, + public List extractInvoices(List pageIndexes, boolean strict) throws IOException { List> correctPageIndexes = new ArrayList<>(); if (!strict) { return extractInvoices(pageIndexes); } - Iterator iterator = pageIndexes.iterator(); + Iterator iterator = pageIndexes.iterator(); List currentList = new ArrayList<>(); Double previousConfidence = null; while (iterator.hasNext()) { - InvoiceSplitterV1Document.PageIndexes pageIndex = iterator.next(); + InvoiceSplitterV1InvoicePageGroup pageIndex = iterator.next(); Double confidence = pageIndex.getConfidence(); List pageList = pageIndex.getPageIndexes(); diff --git a/src/main/java/com/mindee/product/ind/indianpassport/IndianPassportV1Document.java b/src/main/java/com/mindee/product/ind/indianpassport/IndianPassportV1Document.java index 93643d500..15aeeaa50 100644 --- a/src/main/java/com/mindee/product/ind/indianpassport/IndianPassportV1Document.java +++ 
b/src/main/java/com/mindee/product/ind/indianpassport/IndianPassportV1Document.java @@ -11,7 +11,7 @@ import lombok.Getter; /** - * Passport - India API version 1.1 document data. + * Passport - India API version 1.2 document data. */ @Getter @EqualsAndHashCode(callSuper = false) diff --git a/src/main/java/com/mindee/product/invoicesplitter/InvoiceSplitterV1.java b/src/main/java/com/mindee/product/invoicesplitter/InvoiceSplitterV1.java index 1ecd0d9e9..e115f419e 100644 --- a/src/main/java/com/mindee/product/invoicesplitter/InvoiceSplitterV1.java +++ b/src/main/java/com/mindee/product/invoicesplitter/InvoiceSplitterV1.java @@ -6,12 +6,11 @@ import lombok.Getter; /** - * The invoice splitter V1 inference. + * Invoice Splitter API version 1 inference prediction. */ @Getter @JsonIgnoreProperties(ignoreUnknown = true) @EndpointInfo(endpointName = "invoice_splitter", version = "1") -public class InvoiceSplitterV1 extends - Inference { - +public class InvoiceSplitterV1 + extends Inference { } diff --git a/src/main/java/com/mindee/product/invoicesplitter/InvoiceSplitterV1Document.java b/src/main/java/com/mindee/product/invoicesplitter/InvoiceSplitterV1Document.java index b1bb84dbb..6c472ca43 100644 --- a/src/main/java/com/mindee/product/invoicesplitter/InvoiceSplitterV1Document.java +++ b/src/main/java/com/mindee/product/invoicesplitter/InvoiceSplitterV1Document.java @@ -2,67 +2,50 @@ import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.JsonProperty; +import com.mindee.parsing.SummaryHelper; import com.mindee.parsing.common.Prediction; +import java.util.ArrayList; import java.util.List; -import java.util.stream.Collectors; import lombok.EqualsAndHashCode; import lombok.Getter; /** - * Document data for Invoice Splitter, API version 1. + * Invoice Splitter API version 1.2 document data. 
*/ @Getter @EqualsAndHashCode(callSuper = false) @JsonIgnoreProperties(ignoreUnknown = true) public class InvoiceSplitterV1Document extends Prediction { + /** + * List of page groups. Each group represents a single invoice within a multi-invoice document. + */ @JsonProperty("invoice_page_groups") - private List invoicePageGroups; - - @Override - public String toString() { - StringBuilder outStr = new StringBuilder(); - - outStr.append(String.format(":Invoice Page Groups: %n")); - if (invoicePageGroups != null) { - String pageGroupsString = this.getInvoicePageGroups().stream() - .map(PageIndexes::toString) - .collect(Collectors.joining(String.format("%n"))); - outStr.append(String.format("%s%n", pageGroupsString)); - } - return outStr.toString(); - } + protected List invoicePageGroups = new ArrayList<>(); @Override public boolean isEmpty() { - return invoicePageGroups.isEmpty(); + return ( + (this.invoicePageGroups == null || this.invoicePageGroups.isEmpty()) + ); } - /** - * Represents a grouping of pages. - */ - @Getter - @JsonIgnoreProperties(ignoreUnknown = true) - public static class PageIndexes { - - /** - * The confidence about the zone of the value extracted. A value from 0 to 1. 
- */ - @JsonProperty("confidence") - private Double confidence; - - /** - * The page indexes in the document that are grouped together - */ - @JsonProperty("page_indexes") - private List pageIndexes; - - @Override - public String toString() { - return " :Page indexes: ".concat( - pageIndexes.stream().map((index) -> index.toString()) - .collect(Collectors.joining(", ")) - ); + @Override + public String toString() { + StringBuilder outStr = new StringBuilder(); + String invoicePageGroupsSummary = ""; + if (!this.getInvoicePageGroups().isEmpty()) { + int[] invoicePageGroupsColSizes = new int[]{74}; + invoicePageGroupsSummary = + String.format("%n%s%n ", SummaryHelper.lineSeparator(invoicePageGroupsColSizes, "-")) + + "| Page Indexes " + + String.format("|%n%s%n ", SummaryHelper.lineSeparator(invoicePageGroupsColSizes, "=")); + invoicePageGroupsSummary += SummaryHelper.arrayToString(this.getInvoicePageGroups(), invoicePageGroupsColSizes); + invoicePageGroupsSummary += String.format("%n%s", SummaryHelper.lineSeparator(invoicePageGroupsColSizes, "-")); } + outStr.append( + String.format(":Invoice Page Groups: %s%n", invoicePageGroupsSummary) + ); + return SummaryHelper.cleanSummary(outStr.toString()); } } diff --git a/src/main/java/com/mindee/product/invoicesplitter/InvoiceSplitterV1InvoicePageGroup.java b/src/main/java/com/mindee/product/invoicesplitter/InvoiceSplitterV1InvoicePageGroup.java new file mode 100644 index 000000000..42764ad07 --- /dev/null +++ b/src/main/java/com/mindee/product/invoicesplitter/InvoiceSplitterV1InvoicePageGroup.java @@ -0,0 +1,69 @@ +package com.mindee.product.invoicesplitter; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.mindee.parsing.standard.BaseField; +import com.mindee.parsing.standard.LineItemField; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import lombok.Getter; + +/** + * List of 
page groups. Each group represents a single invoice within a multi-invoice document. + */ +@Getter +@JsonIgnoreProperties(ignoreUnknown = true) +public class InvoiceSplitterV1InvoicePageGroup extends BaseField implements LineItemField { + + /** + * List of page indexes that belong to the same invoice (group). + */ + @JsonProperty("page_indexes") + List pageIndexes; + + public boolean isEmpty() { + return ( + pageIndexes == null + ); + } + + private Map tablePrintableValues() { + Map printable = new HashMap<>(); + + printable.put( + "pageIndexes", + this.pageIndexes.stream() + .map(String::valueOf) + .collect(Collectors.joining(", ")) + ); + return printable; + } + + /** + * Output the line in a format suitable for inclusion in an rST table. + */ + public String toTableLine() { + Map printable = this.tablePrintableValues(); + return String.format("| %-72s |", printable.get("pageIndexes")); + } + + @Override + public String toString() { + Map printable = this.printableValues(); + return String.format("Page Indexes: %s", printable.get("pageIndexes")); + } + + private Map printableValues() { + Map printable = new HashMap<>(); + + printable.put( + "pageIndexes", + this.pageIndexes.stream() + .map(String::valueOf) + .collect(Collectors.joining(", ")) + ); + return printable; + } +} diff --git a/src/test/java/com/mindee/product/invoicesplitter/InvoiceSplitterV1Test.java b/src/test/java/com/mindee/product/invoicesplitter/InvoiceSplitterV1Test.java index 83b0dcaee..3d0a1759e 100644 --- a/src/test/java/com/mindee/product/invoicesplitter/InvoiceSplitterV1Test.java +++ b/src/test/java/com/mindee/product/invoicesplitter/InvoiceSplitterV1Test.java @@ -2,23 +2,26 @@ import com.fasterxml.jackson.databind.JavaType; import com.fasterxml.jackson.databind.ObjectMapper; -import com.mindee.parsing.common.AsyncPredictResponse; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.List; +import 
com.mindee.parsing.common.Document; +import com.mindee.parsing.common.PredictResponse; +import com.mindee.parsing.standard.ClassificationField; +import com.mindee.product.ProductTestHelper; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import java.io.File; +import java.io.IOException; +/** + * Unit tests for InvoiceSplitterV1. + */ public class InvoiceSplitterV1Test { - protected AsyncPredictResponse getPrediction(String name) throws IOException { + protected PredictResponse getPrediction(String name) throws IOException { ObjectMapper objectMapper = new ObjectMapper(); objectMapper.findAndRegisterModules(); JavaType type = objectMapper.getTypeFactory().constructParametricType( - AsyncPredictResponse.class, + PredictResponse.class, InvoiceSplitterV1.class ); return objectMapper.readValue( @@ -29,22 +32,19 @@ protected AsyncPredictResponse getPrediction(String name) thr @Test void whenEmptyDeserialized_mustHaveValidProperties() throws IOException { - AsyncPredictResponse response = getPrediction("empty"); - InvoiceSplitterV1Document docPrediction = response.getDocumentObj().getInference().getPrediction(); + PredictResponse response = getPrediction("empty"); + InvoiceSplitterV1Document docPrediction = response.getDocument().getInference().getPrediction(); Assertions.assertTrue(docPrediction.getInvoicePageGroups().isEmpty()); } @Test - void givenAnInvoiceSplitterResponse_whenDeserialized_MustHaveAValidSummary() throws IOException { - AsyncPredictResponse response = getPrediction("complete"); - - String[] actualLines = response.getDocumentObj().toString().split(System.lineSeparator()); - List expectedLines = Files - .readAllLines(Paths.get("src/test/resources/products/invoice_splitter/response_v1/summary_full.rst")); - String expectedSummary = String.join(String.format("%n"), expectedLines); - String actualSummary = String.join(String.format("%n"), actualLines); - - Assertions.assertEquals(expectedSummary, actualSummary); + void 
whenCompleteDeserialized_mustHaveValidDocumentSummary() throws IOException { + PredictResponse response = getPrediction("complete"); + Document doc = response.getDocument(); + ProductTestHelper.assertStringEqualsFile( + doc.toString(), + "src/test/resources/products/invoice_splitter/response_v1/summary_full.rst" + ); } } diff --git a/src/test/resources b/src/test/resources index 86f3c9de4..415f6bf4a 160000 --- a/src/test/resources +++ b/src/test/resources @@ -1 +1 @@ -Subproject commit 86f3c9de490a0f29e56cacee0cd63dbebfa86dc8 +Subproject commit 415f6bf4a13f38af2776cbe2222fdfab92f41ee5