Skip to content

Commit 27c5ec5

Browse files
committed
split input tests
1 parent afa6c5d commit 27c5ec5

File tree

8 files changed

+205
-193
lines changed

8 files changed

+205
-193
lines changed

tests/extraction/test_image_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from mindee.extraction.common.image_extractor import extract_multiple_images_from_source
77
from mindee.input.sources.path_input import PathInput
88
from mindee.product.barcode_reader.barcode_reader_v1 import BarcodeReaderV1
9-
from tests.input.test_inputs import PRODUCT_DATA_DIR
9+
from tests.utils import PRODUCT_DATA_DIR
1010

1111

1212
@pytest.fixture

tests/extraction/test_invoice_splitter_auto_extraction.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,8 @@
88
from mindee.parsing.common.document import Document
99
from mindee.product.invoice.invoice_v4 import InvoiceV4
1010
from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1
11-
from tests.input.test_inputs import PRODUCT_DATA_DIR
1211
from tests.product import get_id, get_version
13-
from tests.utils import levenshtein_ratio
12+
from tests.utils import PRODUCT_DATA_DIR, levenshtein_ratio
1413

1514

1615
@pytest.fixture

tests/extraction/test_multi_receipts_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from mindee.product.multi_receipts_detector.multi_receipts_detector_v1 import (
1111
MultiReceiptsDetectorV1,
1212
)
13-
from tests.input.test_inputs import PRODUCT_DATA_DIR
13+
from tests.utils import PRODUCT_DATA_DIR
1414

1515

1616
@pytest.fixture

tests/extraction/test_pdf_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from mindee.product.invoice_splitter.invoice_splitter_v1_document import (
99
InvoiceSplitterV1Document,
1010
)
11-
from tests.input.test_inputs import PRODUCT_DATA_DIR
11+
from tests.utils import PRODUCT_DATA_DIR
1212

1313

1414
@pytest.fixture
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
import io
2+
3+
import pypdfium2 as pdfium
4+
import pytest
5+
6+
from mindee.error import MindeeError
7+
from mindee.input.page_options import KEEP_ONLY, REMOVE, PageOptions
8+
from mindee.input.sources import (
9+
Base64Input,
10+
BytesInput,
11+
FileInput,
12+
LocalInputSource,
13+
PathInput,
14+
)
15+
from tests.utils import FILE_TYPES_DIR, PRODUCT_DATA_DIR
16+
17+
18+
def _assert_page_options(input_source: LocalInputSource, numb_pages: int) -> None:
    """
    Assert that a processed PDF renders identically to its reference file.

    The reference document is ``multipage_cut-{numb_pages}.pdf`` located under
    ``FILE_TYPES_DIR / "pdf"``. Every page of the reference is rendered and
    compared, byte for byte, with the page at the same index in ``input_source``.

    :param input_source: Input source whose ``file_object`` holds the processed PDF.
    :param numb_pages: Number used to select the reference file name.
    """
    assert input_source.is_pdf() is True
    # Currently the least verbose way of comparing pages with pypdfium2
    # I.e., each page is read and rendered as a rasterized image.
    # These images are then compared as raw byte sequences.
    cut_pdf = pdfium.PdfDocument(input_source.file_object)
    try:
        pdf = pdfium.PdfDocument(
            FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf"
        )
        try:
            for idx in range(len(pdf)):
                pdf_page = pdf.get_page(idx)
                pdf_page_render = pdfium.PdfPage.render(pdf_page)
                cut_pdf_page = cut_pdf.get_page(idx)
                cut_pdf_page_render = pdfium.PdfPage.render(cut_pdf_page)

                assert bytes(pdf_page_render.buffer) == bytes(
                    cut_pdf_page_render.buffer
                )
        finally:
            # Close the reference document even when a comparison fails.
            pdf.close()
    finally:
        # Close the processed document even when opening the reference fails.
        cut_pdf.close()
34+
35+
36+
def test_pdf_reconstruct_ok():
37+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
38+
input_source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=range(5))
39+
assert isinstance(input_source.file_object, io.BytesIO)
40+
41+
42+
@pytest.mark.parametrize("numb_pages", [1, 2, 3])
def test_process_pdf_cut_n_pages(numb_pages: int):
    """Keep 1, 2 or 3 pages through ``process_pdf`` and compare with the reference cut."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    # The first ``numb_pages`` entries of: first page, second-to-last, last.
    kept_indexes = [0, -2, -1][:numb_pages]
    source.process_pdf(
        behavior=KEEP_ONLY,
        on_min_pages=2,
        page_indexes=kept_indexes,
    )
    assert source.page_count == numb_pages
    _assert_page_options(source, numb_pages)
50+
51+
52+
@pytest.mark.parametrize("numb_pages", [1, 2, 3])
def test_apply_pages_pdf_cut_n_pages(numb_pages: int):
    """Keep 1, 2 or 3 pages through ``apply_page_options`` and compare with the reference cut."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    # The first ``numb_pages`` entries of: first page, second-to-last, last.
    kept_indexes = [0, -2, -1][:numb_pages]
    options = PageOptions(on_min_pages=2, page_indexes=kept_indexes)
    source.apply_page_options(options)
    assert source.count_doc_pages() == numb_pages
    _assert_page_options(source, numb_pages)
60+
61+
62+
def test_pdf_keep_5_first_pages():
    """Keeping indexes 0 through 4 must leave a 5-page document."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.is_pdf() is True
    first_five = list(range(5))
    source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=first_five)
    assert source.count_doc_pages() == 5
69+
70+
71+
def test_pdf_keep_invalid_pages():
    """Keeping indexes 0, 1 and 17 is expected to yield only 2 pages."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.is_pdf() is True
    requested_indexes = [0, 1, 17]
    source.process_pdf(
        behavior=KEEP_ONLY,
        on_min_pages=2,
        page_indexes=requested_indexes,
    )
    assert source.count_doc_pages() == 2
78+
79+
80+
def test_pdf_remove_5_last_pages():
    """Removing indexes -5 through -1 must leave 7 pages."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.is_pdf() is True
    last_five = list(range(-5, 0))
    source.process_pdf(behavior=REMOVE, on_min_pages=2, page_indexes=last_five)
    assert source.count_doc_pages() == 7
87+
88+
89+
def test_pdf_remove_5_first_pages():
    """Removing indexes 0 through 4 must leave 7 pages."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.is_pdf() is True
    first_five = [0, 1, 2, 3, 4]
    source.process_pdf(behavior=REMOVE, on_min_pages=2, page_indexes=first_five)
    assert source.count_doc_pages() == 7
96+
97+
98+
def test_pdf_remove_invalid_pages():
    """Asking to remove index 16 is expected to leave the document at 12 pages."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.is_pdf() is True
    source.process_pdf(
        behavior=REMOVE,
        on_min_pages=2,
        page_indexes=[16],
    )
    assert source.count_doc_pages() == 12
103+
104+
105+
def test_pdf_keep_no_pages():
    """A keep-only request with no usable index must raise ``RuntimeError``."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.is_pdf() is True
    unusable_requests = (
        [],  # empty page indexes
        [16, 17],  # all invalid pages
    )
    for indexes in unusable_requests:
        with pytest.raises(RuntimeError):
            source.process_pdf(
                behavior=KEEP_ONLY, on_min_pages=2, page_indexes=indexes
            )
116+
117+
118+
def test_pdf_remove_all_pages():
    """Removing indexes 0 through 14 must raise ``RuntimeError``."""
    source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
    assert source.is_pdf() is True
    every_index = [*range(15)]
    with pytest.raises(RuntimeError):
        source.process_pdf(behavior=REMOVE, on_min_pages=2, page_indexes=every_index)
125+
126+
127+
def test_pdf_input_from_file():
    """A PDF wrapped in ``FileInput`` can be cut down to a single page."""
    pdf_path = FILE_TYPES_DIR / "pdf" / "multipage.pdf"
    # The handle itself is given to FileInput, so it stays open for every step.
    with open(pdf_path, "rb") as handle:
        source = FileInput(handle)
        assert source.is_pdf() is True
        source.process_pdf(
            behavior=KEEP_ONLY,
            on_min_pages=2,
            page_indexes=[0],
        )
        assert source.count_doc_pages() == 1
133+
134+
135+
def test_pdf_input_from_base64():
    """A PDF supplied as base64 text can be cut down to a single page."""
    b64_path = PRODUCT_DATA_DIR / "invoices" / "invoice_10p.txt"
    with open(b64_path, "rt") as handle:
        # The text is read in full here; the handle is not passed to the input source.
        source = Base64Input(handle.read(), filename="invoice_10p.pdf")
    assert source.is_pdf() is True
    source.process_pdf(
        behavior=KEEP_ONLY,
        on_min_pages=2,
        page_indexes=[0],
    )
    assert source.count_doc_pages() == 1
141+
142+
143+
def test_pdf_input_from_bytes():
    """A PDF supplied as raw bytes can be cut down to a single page."""
    pdf_path = PRODUCT_DATA_DIR / "invoices" / "invoice_10p.pdf"
    with open(pdf_path, "rb") as handle:
        # The bytes are read in full here; the handle is not passed to the input source.
        source = BytesInput(handle.read(), filename="invoice_10p.pdf")
    assert source.is_pdf() is True
    source.process_pdf(
        behavior=KEEP_ONLY,
        on_min_pages=2,
        page_indexes=[0],
    )
    assert source.count_doc_pages() == 1
149+
150+
151+
def test_pdf_blank_check():
    """Blank PDFs must raise ``MindeeError``; an image-only PDF must count 1 page."""
    pdf_dir = FILE_TYPES_DIR / "pdf"

    for blank_name in ("blank.pdf", "blank_1.pdf"):
        # Loading and processing both sit inside the expectation.
        with pytest.raises(MindeeError):
            blank_source = PathInput(pdf_dir / blank_name)
            blank_source.process_pdf(
                behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0]
            )

    input_not_blank = PathInput(pdf_dir / "not_blank_image_only.pdf")
    assert input_not_blank.count_doc_pages() == 1

tests/input/test_fix_pdf.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import pytest
2+
3+
from mindee import PathInput
4+
from mindee.error import MimeTypeError
5+
from tests.utils import FILE_TYPES_DIR
6+
7+
8+
def test_broken_unfixable_pdf():
    """Loading and fixing ``broken_unfixable.pdf`` must raise ``MimeTypeError``."""
    broken_path = FILE_TYPES_DIR / "pdf" / "broken_unfixable.pdf"
    # Loading and fixing both sit inside the expectation.
    with pytest.raises(MimeTypeError):
        source = PathInput(broken_path)
        source.fix_pdf()
12+
13+
14+
def test_broken_fixable_pdf():
    """After ``fix_pdf``, ``broken_fixable.pdf`` must report a single page."""
    broken_path = FILE_TYPES_DIR / "pdf" / "broken_fixable.pdf"
    source = PathInput(broken_path)
    source.fix_pdf()
    assert source.page_count == 1
18+
19+
20+
def test_broken_fixable_invoice_pdf():
    """``fix_pdf`` on ``broken_invoice.pdf`` must complete without raising."""
    broken_path = FILE_TYPES_DIR / "pdf" / "broken_invoice.pdf"
    source = PathInput(broken_path)
    source.fix_pdf()

0 commit comments

Comments
 (0)