Skip to content

Commit b85baf7

Browse files
✨ add support for remote resource fetching
1 parent 6cb036b commit b85baf7

File tree

2 files changed

+252
-1
lines changed

2 files changed

+252
-1
lines changed

mindee/input/sources/url_input_source.py

Lines changed: 184 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,15 @@
1+
import os
2+
import random
3+
import string
4+
from datetime import datetime
5+
from pathlib import Path
6+
from typing import Optional, Union
7+
from urllib.parse import urlparse
8+
9+
import requests
10+
111
from mindee.error.mindee_error import MindeeSourceError
12+
from mindee.input.sources.bytes_input import BytesInput
213
from mindee.input.sources.local_input_source import InputType
314
from mindee.logger import logger
415

@@ -13,7 +24,7 @@ def __init__(self, url: str) -> None:
1324
"""
1425
Input document from a base64 encoded string.
1526
16-
:param url: URL to send, must be HTTPS
27+
:param url: URL to send, must be HTTPS.
1728
"""
1829
if not url.lower().startswith("https"):
1930
raise MindeeSourceError("URL must be HTTPS")
@@ -23,3 +34,175 @@ def __init__(self, url: str) -> None:
2334
logger.debug("URL input: %s", url)
2435

2536
self.url = url
37+
38+
def __fetch_file_content(
    self,
    username: Optional[str] = None,
    password: Optional[str] = None,
    token: Optional[str] = None,
    headers: Optional[dict] = None,
    max_redirects: int = 3,
) -> bytes:
    """
    Fetch the content of the file from the URL.

    Basic authentication is only used when both ``username`` and ``password``
    are provided. When ``token`` is provided, it is sent as a bearer token in
    the ``Authorization`` header, replacing any value given in ``headers``.

    :param username: Optional username for authentication.
    :param password: Optional password for authentication.
    :param token: Optional token for authentication.
    :param headers: Optional additional headers for the request.
    :param max_redirects: Maximum number of redirects to follow.
    :return: The content of the file as bytes.
    :raises MindeeSourceError: Propagated from the underlying request helper.
    """
    # Work on a copy so the caller's dictionary is never modified when the
    # Authorization header is added below.
    request_headers = dict(headers) if headers else {}
    if token:
        request_headers["Authorization"] = f"Bearer {token}"

    # Credentials are only meaningful as a complete pair.
    auth = (username, password) if username and password else None

    return UrlInputSource.__make_request(
        self.url, auth, request_headers, 0, max_redirects=max_redirects
    )
67+
68+
def save_to_file(
    self,
    filepath: Union[Path, str],
    filename: Optional[str] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    token: Optional[str] = None,
    headers: Optional[dict] = None,
    max_redirects: int = 3,
) -> Path:
    """
    Download the URL content and write it to a file on disk.

    :param filepath: Directory in which the file is written.
    :param filename: Optional name for the written file.
    :param username: Optional username for authentication.
    :param password: Optional password for authentication.
    :param token: Optional token for authentication.
    :param headers: Optional additional headers for the request.
    :param max_redirects: Maximum number of redirects to follow.
    :return: The path to the saved file.
    """
    # Download first: nothing is written if the request fails.
    content = self.__fetch_file_content(
        username=username,
        password=password,
        token=token,
        headers=headers,
        max_redirects=max_redirects,
    )
    destination = Path(filepath) / self.__fill_filename(filename)
    destination.write_bytes(content)
    return destination
98+
99+
def as_local_input_source(
    self,
    filename: Optional[str] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    token: Optional[str] = None,
    headers: Optional[dict] = None,
    max_redirects: int = 3,
) -> BytesInput:
    """
    Download the URL content and wrap it in a BytesInput object.

    :param filename: Optional filename for the BytesInput.
    :param username: Optional username for authentication.
    :param password: Optional password for authentication.
    :param token: Optional token for authentication.
    :param headers: Optional additional headers for the request.
    :param max_redirects: Maximum number of redirects to follow.
    :return: A BytesInput object containing the file content.
    """
    content = self.__fetch_file_content(
        username=username,
        password=password,
        token=token,
        headers=headers,
        max_redirects=max_redirects,
    )
    resolved_name = self.__fill_filename(filename)
    return BytesInput(content, resolved_name)
125+
126+
@staticmethod
def __extract_filename_from_url(uri) -> str:
    """
    Return the last path component of a URL.

    Only the path part of the URL is considered; the query string and
    fragment are ignored.

    :param uri: The URL to extract the filename from.
    :return: The extracted filename or an empty string if not found.
    """
    url_path = urlparse(uri).path
    return os.path.basename(url_path) or ""
136+
137+
@staticmethod
def __generate_file_name(extension=".tmp") -> str:
    """
    Build a filename made of a timestamp and a random suffix.

    The name has the form ``mindee_temp_<timestamp>_<8 random chars><extension>``,
    where the random characters are lowercase ASCII letters and digits.

    :param extension: The file extension to use (default is '.tmp').
    :return: A generated filename.
    """
    alphabet = string.ascii_lowercase + string.digits
    random_string = "".join(random.choices(alphabet, k=8))
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    return f"mindee_temp_{timestamp}_{random_string}{extension}"
150+
151+
@staticmethod
def __get_file_extension(filename) -> Optional[str]:
    """
    Return the lowercased extension of a filename, including the leading dot.

    :param filename: The filename to extract the extension from.
    :return: The lowercase file extension or None if not found.
    """
    _, suffix = os.path.splitext(filename)
    if not suffix:
        return None
    return suffix.lower()
161+
162+
def __fill_filename(self, filename=None) -> str:
    """
    Fill in a filename if not provided or incomplete.

    When no filename is given, the name is taken from the path of the URL.
    If the resulting name is empty or has no extension, a unique temporary
    name is generated instead.

    :param filename: Optional filename to use.
    :return: A complete filename.
    """
    if filename is None:
        filename = UrlInputSource.__extract_filename_from_url(self.url)

    if not filename or not os.path.splitext(filename)[1]:
        # This branch is only reached when no extension is available, so the
        # extension lookup yields None. Fall back to ".tmp" explicitly rather
        # than formatting None into the name as the literal text "None".
        extension = UrlInputSource.__get_file_extension(filename) or ".tmp"
        filename = self.__generate_file_name(extension=extension)

    return filename
178+
179+
@staticmethod
def __make_request(url, auth, headers, redirects, max_redirects) -> bytes:
    """
    Makes an HTTP request to the given URL, while following redirections.

    :param url: The URL to request.
    :param auth: Authentication tuple (username, password).
    :param headers: Headers for the request.
    :param redirects: Current number of redirects.
    :param max_redirects: Maximum number of redirects to follow.
    :return: The content of the response.
    :raises MindeeSourceError: If max redirects are exceeded, a redirection
        has no target, or the request fails.
    """
    # Local import: only needed to resolve relative redirection targets.
    from urllib.parse import urljoin

    # NOTE(review): requests follows standard redirections by itself unless
    # allow_redirects=False is passed, so this code presumably only sees the
    # 3xx responses requests did not follow -- confirm whether max_redirects
    # is meant to cap those as well.
    result = requests.get(url, headers=headers, timeout=120, auth=auth)
    if 299 < result.status_code < 400:
        if redirects == max_redirects:
            raise MindeeSourceError(
                f"Can't reach URL after {redirects} out of {max_redirects} redirects, "
                f"aborting operation."
            )
        # The redirection target is carried by the response's Location
        # header, not by the integer redirect counter.
        location = result.headers.get("location")
        if not location:
            # A 3xx response without a target cannot be followed.
            raise MindeeSourceError(
                f"Couldn't retrieve file from server, error code {result.status_code}."
            )
        # The Location header may be relative to the requested URL.
        return UrlInputSource.__make_request(
            urljoin(url, location), auth, headers, redirects + 1, max_redirects
        )

    if result.status_code >= 400 or result.status_code < 200:
        raise MindeeSourceError(
            f"Couldn't retrieve file from server, error code {result.status_code}."
        )

    return result.content
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import os
2+
from pathlib import Path
3+
4+
import pytest
5+
6+
from mindee import Client
7+
from mindee.product.invoice import InvoiceV4
8+
9+
10+
@pytest.fixture
def client():
    """Provide a Client built with its default arguments."""
    mindee_client = Client()
    return mindee_client
13+
14+
15+
@pytest.fixture
def output_file_path():
    """Provide the directory in which the tests write downloaded files."""
    output_directory = Path("tests/data/output/")
    return output_directory
18+
19+
20+
@pytest.fixture
def reference_file_path():
    """Provide the URL of the remote PDF used as reference by the tests."""
    reference_url = "https://github.com/mindee/client-lib-test-data/blob/main/products/invoice_splitter/invoice_5p.pdf?raw=true"
    return reference_url
23+
24+
25+
@pytest.mark.integration
def test_load_local_file(client, reference_file_path):
    """A downloaded URL source keeps the filename found in the URL."""
    remote_source = client.source_from_url(reference_file_path)
    downloaded_source = remote_source.as_local_input_source()
    parsed = client.parse(InvoiceV4, downloaded_source)
    assert parsed.document.n_pages == 5
    assert parsed.document.filename == "invoice_5p.pdf"
32+
33+
34+
@pytest.mark.integration
def test_custom_file_name(client, reference_file_path):
    """A filename given explicitly is used instead of the one in the URL."""
    remote_source = client.source_from_url(reference_file_path)
    downloaded_source = remote_source.as_local_input_source("customName.pdf")
    parsed = client.parse(InvoiceV4, downloaded_source)
    assert parsed.document.n_pages == 5
    assert parsed.document.filename == "customName.pdf"
41+
42+
43+
@pytest.mark.integration
def test_save_file(client, reference_file_path, output_file_path):
    """Saving without a filename writes a file named after the URL."""
    remote_source = client.source_from_url(reference_file_path)
    remote_source.save_to_file(output_file_path)
    expected_file = os.path.join(output_file_path, "invoice_5p.pdf")
    assert os.path.exists(expected_file)
48+
49+
50+
@pytest.mark.integration
def test_save_file_with_filename(client, reference_file_path, output_file_path):
    """Saving with an explicit filename writes a file with that name."""
    remote_source = client.source_from_url(reference_file_path)
    remote_source.save_to_file(output_file_path, "customFileName.pdf")
    expected_file = os.path.join(output_file_path, "customFileName.pdf")
    assert os.path.exists(expected_file)
55+
56+
57+
@pytest.fixture(autouse=True)
def cleanup(request, output_file_path: Path):
    """Register a finalizer that deletes the files written by the tests."""

    def remove_test_files():
        # Only the two names the save tests can produce are removed.
        for generated_name in ("invoice_5p.pdf", "customFileName.pdf"):
            generated_file = (output_file_path / generated_name).resolve()
            if os.path.exists(generated_file):
                os.remove(generated_file)

    request.addfinalizer(remove_test_files)

0 commit comments

Comments
 (0)