diff --git a/README.md b/README.md
index 0bf10846..f9faf4ae 100644
--- a/README.md
+++ b/README.md
@@ -146,7 +146,7 @@ Complete details on the working of the library are available in the following gu
 * [Python US Driver License OCR](https://developers.mindee.com/docs/python-eu-driver-license-ocr)
 * [Python FR Bank Account Detail OCR](https://developers.mindee.com/docs/python-fr-bank-account-details-ocr)
 * [Python FR Carte Grise OCR](https://developers.mindee.com/docs/python-fr-carte-grise-ocr)
-* [Python FR Carte Vitale OCR](https://developers.mindee.com/docs/python-fr-carte-vitale-ocr)
+* [Python FR Health Card OCR](https://developers.mindee.com/docs/python-fr-health-card-ocr)
 * [Python FR ID Card OCR](https://developers.mindee.com/docs/python-fr-carte-nationale-didentite-ocr)
 * [Python FR Petrol Receipts OCR](https://developers.mindee.com/docs/python-fr-petrol-receipts-ocr)
 * [Python US Bank Check OCR](https://developers.mindee.com/docs/python-us-bank-check-ocr)
diff --git a/docs/extras/code_samples/carte_vitale_v1.txt b/docs/extras/code_samples/carte_vitale_v1.txt
deleted file mode 100644
index d9f4c6a8..00000000
--- a/docs/extras/code_samples/carte_vitale_v1.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-from mindee import Client, PredictResponse, product
-
-# Init a new client
-mindee_client = Client(api_key="my-api-key")
-
-# Load a file from disk
-input_doc = mindee_client.source_from_path("/path/to/the/file.ext")
-
-# Load a file from disk and parse it.
-# The endpoint name must be specified since it cannot be determined from the class.
-result: PredictResponse = mindee_client.parse(product.fr.CarteVitaleV1, input_doc)
-
-# Print a summary of the API result
-print(result.document)
-
-# Print the document-level summary
-# print(result.document.inference.prediction)
diff --git a/docs/product/fr/carte_vitale_v1.rst b/docs/product/fr/carte_vitale_v1.rst
deleted file mode 100644
index b9ef47a4..00000000
--- a/docs/product/fr/carte_vitale_v1.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-Carte Vitale V1
----------------
-
-**Sample Code:**
-
-.. literalinclude:: /extras/code_samples/carte_vitale_v1.txt
-   :language: Python
-
-.. autoclass:: mindee.product.fr.carte_vitale.carte_vitale_v1.CarteVitaleV1
-   :members:
-   :inherited-members:
-
-.. autoclass:: mindee.product.fr.carte_vitale.carte_vitale_v1_document.CarteVitaleV1Document
-   :members:
-   :inherited-members:
diff --git a/mindee/input/sources/url_input_source.py b/mindee/input/sources/url_input_source.py
index 4f26dc02..983343e5 100644
--- a/mindee/input/sources/url_input_source.py
+++ b/mindee/input/sources/url_input_source.py
@@ -1,4 +1,15 @@
+import os
+import random
+import string
+from datetime import datetime
+from pathlib import Path
+from typing import Optional, Union
+from urllib.parse import urljoin, urlparse
+
+import requests
+
 from mindee.error.mindee_error import MindeeSourceError
+from mindee.input.sources.bytes_input import BytesInput
 from mindee.input.sources.local_input_source import InputType
 from mindee.logger import logger
 
@@ -13,7 +24,7 @@ def __init__(self, url: str) -> None:
         """
         Input document from a base64 encoded string.
 
-        :param url: URL to send, must be HTTPS
+        :param url: URL to send, must be HTTPS.
         """
         if not url.lower().startswith("https"):
             raise MindeeSourceError("URL must be HTTPS")
@@ -23,3 +34,187 @@ def __init__(self, url: str) -> None:
         logger.debug("URL input: %s", url)
 
         self.url = url
+
+    def __fetch_file_content(
+        self,
+        username: Optional[str] = None,
+        password: Optional[str] = None,
+        token: Optional[str] = None,
+        headers: Optional[dict] = None,
+        max_redirects: int = 3,
+    ) -> bytes:
+        """
+        Fetch the content of the file from the URL.
+
+        :param username: Optional username for authentication.
+        :param password: Optional password for authentication.
+        :param token: Optional token for authentication.
+        :param headers: Optional additional headers for the request.
+        :param max_redirects: Maximum number of redirects to follow.
+        :return: The content of the file as bytes.
+        """
+        # Work on a copy so that the caller's dict is never mutated.
+        headers = dict(headers) if headers else {}
+        if token:
+            headers["Authorization"] = f"Bearer {token}"
+        auth = None if not username or not password else (username, password)
+
+        response = UrlInputSource.__make_request(
+            self.url, auth, headers, 0, max_redirects=max_redirects
+        )
+
+        return response
+
+    def save_to_file(
+        self,
+        filepath: Union[Path, str],
+        filename: Optional[str] = None,
+        username: Optional[str] = None,
+        password: Optional[str] = None,
+        token: Optional[str] = None,
+        headers: Optional[dict] = None,
+        max_redirects: int = 3,
+    ) -> Path:
+        """
+        Save the content of the URL to a file.
+
+        :param filepath: Path to save the content to.
+        :param filename: Optional filename to give to the file.
+        :param username: Optional username for authentication.
+        :param password: Optional password for authentication.
+        :param token: Optional token for authentication.
+        :param headers: Optional additional headers for the request.
+        :param max_redirects: Maximum number of redirects to follow.
+        :return: The path to the saved file.
+        """
+        response = self.__fetch_file_content(
+            username, password, token, headers, max_redirects
+        )
+        filename = self.__fill_filename(filename)
+        full_path = Path(filepath) / filename
+        with open(full_path, "wb") as binary_file:
+            binary_file.write(response)
+        return full_path
+
+    def as_local_input_source(
+        self,
+        filename: Optional[str] = None,
+        username: Optional[str] = None,
+        password: Optional[str] = None,
+        token: Optional[str] = None,
+        headers: Optional[dict] = None,
+        max_redirects: int = 3,
+    ) -> BytesInput:
+        """
+        Convert the URL content to a BytesInput object.
+
+        :param filename: Optional filename for the BytesInput.
+        :param username: Optional username for authentication.
+        :param password: Optional password for authentication.
+        :param token: Optional token for authentication.
+        :param headers: Optional additional headers for the request.
+        :param max_redirects: Maximum number of redirects to follow.
+        :return: A BytesInput object containing the file content.
+        """
+        response = self.__fetch_file_content(
+            username, password, token, headers, max_redirects
+        )
+        filename = self.__fill_filename(filename)
+
+        return BytesInput(response, filename)
+
+    @staticmethod
+    def __extract_filename_from_url(uri) -> str:
+        """
+        Extract the filename from a given URL.
+
+        :param uri: The URL to extract the filename from.
+        :return: The extracted filename or an empty string if not found.
+        """
+        filename = os.path.basename(urlparse(uri).path)
+        return filename if filename else ""
+
+    @staticmethod
+    def __generate_file_name(extension=".tmp") -> str:
+        """
+        Generate a unique filename with a timestamp and random string.
+
+        :param extension: The file extension to use (default is '.tmp').
+        :return: A generated filename.
+        """
+        random_string = "".join(
+            random.choices(string.ascii_lowercase + string.digits, k=8)
+        )
+        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        return f"mindee_temp_{timestamp}_{random_string}{extension}"
+
+    @staticmethod
+    def __get_file_extension(filename) -> Optional[str]:
+        """
+        Get the extension from a filename.
+
+        :param filename: The filename to extract the extension from.
+        :return: The lowercase file extension or None if not found.
+        """
+        ext = os.path.splitext(filename)[1]
+        return ext.lower() if ext else None
+
+    def __fill_filename(self, filename=None) -> str:
+        """
+        Fill in a filename if not provided or incomplete.
+
+        :param filename: Optional filename to use.
+        :return: A complete filename.
+        """
+        if filename is None:
+            filename = UrlInputSource.__extract_filename_from_url(self.url)
+
+        if not filename or not os.path.splitext(filename)[1]:
+            # No usable extension at this point: rely on the generator's default
+            # rather than passing `None`, which would yield a "...None" file name.
+            filename = UrlInputSource.__generate_file_name()
+
+        return filename
+
+    @staticmethod
+    def __make_request(url, auth, headers, redirects, max_redirects) -> bytes:
+        """
+        Makes an HTTP request to the given URL, while following redirections.
+
+        :param url: The URL to request.
+        :param auth: Authentication tuple (username, password).
+        :param headers: Headers for the request.
+        :param redirects: Current number of redirects.
+        :param max_redirects: Maximum number of redirects to follow.
+        :return: The content of the response.
+        :raises MindeeSourceError: If max redirects are exceeded or the request fails.
+        """
+        # Redirects are followed manually so that `max_redirects` is enforced;
+        # `requests` would otherwise follow them on its own.
+        result = requests.get(
+            url, headers=headers, timeout=120, auth=auth, allow_redirects=False
+        )
+        if 299 < result.status_code < 400:
+            if redirects == max_redirects:
+                raise MindeeSourceError(
+                    f"Can't reach URL after {redirects} out of {max_redirects} redirects, "
+                    f"aborting operation."
+                )
+            location = result.headers.get("location")
+            if not location:
+                raise MindeeSourceError(
+                    f"Redirection without a target, error code {result.status_code}."
+                )
+            # NOTE(review): `auth` and `headers` are re-sent to the redirect target
+            # as-is; confirm this is acceptable for cross-host redirects.
+            # The target may be relative, so resolve it against the current URL.
+            return UrlInputSource.__make_request(
+                urljoin(url, location), auth, headers, redirects + 1, max_redirects
+            )
+
+        if result.status_code >= 400 or result.status_code < 200:
+            raise MindeeSourceError(
+                f"Couldn't retrieve file from server, error code {result.status_code}."
+            )
+
+        return result.content
diff --git a/mindee/product/fr/__init__.py b/mindee/product/fr/__init__.py
index 020c7657..87039785 100644
--- a/mindee/product/fr/__init__.py
+++ b/mindee/product/fr/__init__.py
@@ -15,10 +15,6 @@
 )
 from mindee.product.fr.carte_grise.carte_grise_v1 import CarteGriseV1
 from mindee.product.fr.carte_grise.carte_grise_v1_document import CarteGriseV1Document
-from mindee.product.fr.carte_vitale.carte_vitale_v1 import CarteVitaleV1
-from mindee.product.fr.carte_vitale.carte_vitale_v1_document import (
-    CarteVitaleV1Document,
-)
 from mindee.product.fr.energy_bill.energy_bill_v1 import EnergyBillV1
 from mindee.product.fr.energy_bill.energy_bill_v1_document import EnergyBillV1Document
 from mindee.product.fr.energy_bill.energy_bill_v1_energy_consumer import (
diff --git a/mindee/product/fr/carte_vitale/__init__.py b/mindee/product/fr/carte_vitale/__init__.py
deleted file mode 100644
index 2c20894f..00000000
--- a/mindee/product/fr/carte_vitale/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from mindee.product.fr.carte_vitale.carte_vitale_v1 import CarteVitaleV1
-from mindee.product.fr.carte_vitale.carte_vitale_v1_document import (
-    CarteVitaleV1Document,
-)
diff --git a/mindee/product/fr/carte_vitale/carte_vitale_v1.py b/mindee/product/fr/carte_vitale/carte_vitale_v1.py
deleted file mode 100644
index a70184b1..00000000
--- a/mindee/product/fr/carte_vitale/carte_vitale_v1.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from typing import List
-
-from mindee.parsing.common.inference import Inference
-from mindee.parsing.common.page import Page
-from mindee.parsing.common.string_dict import StringDict
-from mindee.product.fr.carte_vitale.carte_vitale_v1_document import (
-    CarteVitaleV1Document,
-)
-
-
-class CarteVitaleV1(Inference):
-    """Carte Vitale API version 1 inference prediction."""
-
-    prediction: CarteVitaleV1Document
-    """Document-level prediction."""
-    pages: List[Page[CarteVitaleV1Document]]
-    """Page-level prediction(s)."""
-    endpoint_name = "carte_vitale"
-    """Name of the endpoint."""
-    endpoint_version = "1"
-    """Version of the endpoint."""
-
-    def __init__(self, raw_prediction: StringDict):
-        """
-        Carte Vitale v1 inference.
-
-        :param raw_prediction: Raw prediction from the HTTP response.
-        """
-        super().__init__(raw_prediction)
-
-        self.prediction = CarteVitaleV1Document(raw_prediction["prediction"])
-        self.pages = []
-        for page in raw_prediction["pages"]:
-            try:
-                page_prediction = page["prediction"]
-            except KeyError:
-                continue
-            if page_prediction:
-                self.pages.append(Page(CarteVitaleV1Document, page))
diff --git a/mindee/product/fr/carte_vitale/carte_vitale_v1_document.py b/mindee/product/fr/carte_vitale/carte_vitale_v1_document.py
deleted file mode 100644
index 0120f23d..00000000
--- a/mindee/product/fr/carte_vitale/carte_vitale_v1_document.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from typing import List, Optional
-
-from mindee.parsing.common.prediction import Prediction
-from mindee.parsing.common.string_dict import StringDict
-from mindee.parsing.common.summary_helper import clean_out_string
-from mindee.parsing.standard.date import DateField
-from mindee.parsing.standard.text import StringField
-
-
-class CarteVitaleV1Document(Prediction):
-    """Carte Vitale API version 1.1 document data."""
-
-    given_names: List[StringField]
-    """The given name(s) of the card holder."""
-    issuance_date: DateField
-    """The date the card was issued."""
-    social_security: StringField
-    """The Social Security Number (Numéro de Sécurité Sociale) of the card holder"""
-    surname: StringField
-    """The surname of the card holder."""
-
-    def __init__(
-        self,
-        raw_prediction: StringDict,
-        page_id: Optional[int] = None,
-    ):
-        """
-        Carte Vitale document.
-
-        :param raw_prediction: Raw prediction from HTTP response
-        :param page_id: Page number for multi pages pdf input
-        """
-        super().__init__(raw_prediction, page_id)
-        self.given_names = [
-            StringField(prediction, page_id=page_id)
-            for prediction in raw_prediction["given_names"]
-        ]
-        self.issuance_date = DateField(
-            raw_prediction["issuance_date"],
-            page_id=page_id,
-        )
-        self.social_security = StringField(
-            raw_prediction["social_security"],
-            page_id=page_id,
-        )
-        self.surname = StringField(
-            raw_prediction["surname"],
-            page_id=page_id,
-        )
-
-    def __str__(self) -> str:
-        given_names = f"\n { ' ' * 15 }".join(
-            [str(item) for item in self.given_names],
-        )
-        out_str: str = f":Given Name(s): {given_names}\n"
-        out_str += f":Surname: {self.surname}\n"
-        out_str += f":Social Security Number: {self.social_security}\n"
-        out_str += f":Issuance Date: {self.issuance_date}\n"
-        return clean_out_string(out_str)
diff --git a/tests/input/test_url_input_source_integration.py b/tests/input/test_url_input_source_integration.py
new file mode 100644
index 00000000..26edbd41
--- /dev/null
+++ b/tests/input/test_url_input_source_integration.py
@@ -0,0 +1,68 @@
+import os
+from pathlib import Path
+
+import pytest
+
+from mindee import Client
+from mindee.product.invoice import InvoiceV4
+
+
+@pytest.fixture
+def client():
+    return Client()
+
+
+@pytest.fixture
+def output_file_path():
+    return Path("tests/data/output/")
+
+
+@pytest.fixture
+def reference_file_path():
+    return "https://github.com/mindee/client-lib-test-data/blob/main/products/invoice_splitter/invoice_5p.pdf?raw=true"
+
+
+@pytest.mark.integration
+def test_load_local_file(client, reference_file_path):
+    url_source = client.source_from_url(reference_file_path)
+    local_source = url_source.as_local_input_source()
+    result = client.parse(InvoiceV4, local_source)
+    assert result.document.n_pages == 5
+    assert result.document.filename == "invoice_5p.pdf"
+
+
+@pytest.mark.integration
+def test_custom_file_name(client, reference_file_path):
+    url_source = client.source_from_url(reference_file_path)
+    local_source = url_source.as_local_input_source("customName.pdf")
+    result = client.parse(InvoiceV4, local_source)
+    assert result.document.n_pages == 5
+    assert result.document.filename == "customName.pdf"
+
+
+@pytest.mark.integration
+def test_save_file(client, reference_file_path, output_file_path):
+    url_source = client.source_from_url(reference_file_path)
+    url_source.save_to_file(output_file_path)
+    assert os.path.exists(os.path.join(output_file_path, "invoice_5p.pdf"))
+
+
+@pytest.mark.integration
+def test_save_file_with_filename(client, reference_file_path, output_file_path):
+    url_source = client.source_from_url(reference_file_path)
+    url_source.save_to_file(output_file_path, "customFileName.pdf")
+    assert os.path.exists(os.path.join(output_file_path, "customFileName.pdf"))
+
+
+@pytest.fixture(autouse=True)
+def cleanup(request, output_file_path: Path):
+    def remove_test_files():
+        generated_files = [
+            Path.resolve(output_file_path / "invoice_5p.pdf"),
+            Path.resolve(output_file_path / "customFileName.pdf"),
+        ]
+        for filepath in generated_files:
+            if os.path.exists(filepath):
+                os.remove(filepath)
+
+    request.addfinalizer(remove_test_files)
diff --git a/tests/product/fr/carte_vitale/__init__.py b/tests/product/fr/carte_vitale/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/product/fr/carte_vitale/test_carte_vitale_v1.py b/tests/product/fr/carte_vitale/test_carte_vitale_v1.py
deleted file mode 100644
index 514d1c49..00000000
--- a/tests/product/fr/carte_vitale/test_carte_vitale_v1.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import json
-
-import pytest
-
-from mindee.parsing.common.document import Document
-from mindee.parsing.common.page import Page
-from mindee.product.fr.carte_vitale.carte_vitale_v1 import CarteVitaleV1
-from mindee.product.fr.carte_vitale.carte_vitale_v1_document import (
-    CarteVitaleV1Document,
-)
-from tests.product import PRODUCT_DATA_DIR
-
-RESPONSE_DIR = PRODUCT_DATA_DIR / "carte_vitale" / "response_v1"
-
-CarteVitaleV1DocumentType = Document[
-    CarteVitaleV1Document,
-    Page[CarteVitaleV1Document],
-]
-
-
-@pytest.fixture
-def complete_doc() -> CarteVitaleV1DocumentType:
-    file_path = RESPONSE_DIR / "complete.json"
-    with open(file_path, "r", encoding="utf-8") as open_file:
-        json_data = json.load(open_file)
-    return Document(CarteVitaleV1, json_data["document"])
-
-
-@pytest.fixture
-def empty_doc() -> CarteVitaleV1DocumentType:
-    file_path = RESPONSE_DIR / "empty.json"
-    with open(file_path, "r", encoding="utf-8") as open_file:
-        json_data = json.load(open_file)
-    return Document(CarteVitaleV1, json_data["document"])
-
-
-def test_complete_doc(complete_doc: CarteVitaleV1DocumentType):
-    file_path = RESPONSE_DIR / "summary_full.rst"
-    with open(file_path, "r", encoding="utf-8") as open_file:
-        reference_str = open_file.read()
-    assert str(complete_doc) == reference_str
-
-
-def test_empty_doc(empty_doc: CarteVitaleV1DocumentType):
-    prediction = empty_doc.inference.prediction
-    assert len(prediction.given_names) == 0
-    assert prediction.surname.value is None
-    assert prediction.social_security.value is None
-    assert prediction.issuance_date.value is None
diff --git a/tests/product/fr/carte_vitale/test_carte_vitale_v1_regression.py b/tests/product/fr/carte_vitale/test_carte_vitale_v1_regression.py
deleted file mode 100644
index 8e9e3294..00000000
--- a/tests/product/fr/carte_vitale/test_carte_vitale_v1_regression.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import pytest
-
-from mindee.client import Client
-from mindee.product.fr.carte_vitale.carte_vitale_v1 import CarteVitaleV1
-from tests.product import PRODUCT_DATA_DIR, get_id, get_version
-
-
-@pytest.mark.regression
-def test_default_sample():
-    client = Client()
-    with open(
-        PRODUCT_DATA_DIR / "carte_vitale" / "response_v1" / "default_sample.rst",
-        encoding="utf-8",
-    ) as rst_file:
-        rst_ref = rst_file.read()
-
-    sample = client.source_from_path(
-        PRODUCT_DATA_DIR / "carte_vitale" / "default_sample.jpg",
-    )
-    response = client.parse(CarteVitaleV1, sample)
-    doc_response = response.document
-    doc_response.id = get_id(rst_ref)
-    doc_response.inference.product.version = get_version(rst_ref)
-    assert str(doc_response) == rst_ref