diff --git a/mindee/input/__init__.py b/mindee/input/__init__.py index 008d880d..3c75c072 100644 --- a/mindee/input/__init__.py +++ b/mindee/input/__init__.py @@ -1,12 +1,9 @@ from mindee.input.local_response import LocalResponse from mindee.input.page_options import PageOptions -from mindee.input.sources import ( - Base64Input, - BytesInput, - FileInput, - InputType, - LocalInputSource, - PathInput, - UrlInputSource, -) +from mindee.input.sources.base_64_input import Base64Input +from mindee.input.sources.bytes_input import BytesInput +from mindee.input.sources.file_input import FileInput +from mindee.input.sources.local_input_source import InputType, LocalInputSource +from mindee.input.sources.path_input import PathInput +from mindee.input.sources.url_input_source import UrlInputSource from mindee.input.workflow_options import WorkflowOptions diff --git a/mindee/input/sources/__init__.py b/mindee/input/sources/__init__.py new file mode 100644 index 00000000..6f8a51e3 --- /dev/null +++ b/mindee/input/sources/__init__.py @@ -0,0 +1,6 @@ +from mindee.input.sources.base_64_input import Base64Input +from mindee.input.sources.bytes_input import BytesInput +from mindee.input.sources.file_input import FileInput +from mindee.input.sources.local_input_source import InputType, LocalInputSource +from mindee.input.sources.path_input import PathInput +from mindee.input.sources.url_input_source import UrlInputSource diff --git a/mindee/input/sources/base_64_input.py b/mindee/input/sources/base_64_input.py new file mode 100644 index 00000000..b651bd23 --- /dev/null +++ b/mindee/input/sources/base_64_input.py @@ -0,0 +1,20 @@ +import base64 +import io + +from mindee.input.sources.local_input_source import InputType, LocalInputSource + + +class Base64Input(LocalInputSource): + """Base64-encoded text input.""" + + def __init__(self, base64_string: str, filename: str) -> None: + """ + Input document from a base64 encoded string. 
+ + :param base64_string: Raw data as a base64 encoded string + :param filename: File name of the input + """ + self.file_object = io.BytesIO(base64.standard_b64decode(base64_string)) + self.filename = filename + self.filepath = None + super().__init__(input_type=InputType.BASE64) diff --git a/mindee/input/sources/bytes_input.py b/mindee/input/sources/bytes_input.py new file mode 100644 index 00000000..13fbf41d --- /dev/null +++ b/mindee/input/sources/bytes_input.py @@ -0,0 +1,19 @@ +import io + +from mindee.input.sources.local_input_source import InputType, LocalInputSource + + +class BytesInput(LocalInputSource): + """Raw bytes input.""" + + def __init__(self, raw_bytes: bytes, filename: str) -> None: + """ + Input document from raw bytes (no buffer). + + :param raw_bytes: Raw data as bytes + :param filename: File name of the input + """ + self.file_object = io.BytesIO(raw_bytes) + self.filename = filename + self.filepath = None + super().__init__(input_type=InputType.BYTES) diff --git a/mindee/input/sources/file_input.py b/mindee/input/sources/file_input.py new file mode 100644 index 00000000..561fd754 --- /dev/null +++ b/mindee/input/sources/file_input.py @@ -0,0 +1,23 @@ +import os +from typing import BinaryIO + +from mindee.input.sources.local_input_source import InputType, LocalInputSource + + +class FileInput(LocalInputSource): + """A binary file input.""" + + def __init__(self, file: BinaryIO) -> None: + """ + Input document from a Python binary file object. + + Note: the calling function is responsible for closing the file. 
+ + :param file: FileIO object + """ + assert file.name, "File name must be set" + + self.file_object = file + self.filename = os.path.basename(file.name) + self.filepath = file.name + super().__init__(input_type=InputType.FILE) diff --git a/mindee/input/sources.py b/mindee/input/sources/local_input_source.py similarity index 72% rename from mindee/input/sources.py rename to mindee/input/sources/local_input_source.py index fd0f9831..ef5bcaf5 100644 --- a/mindee/input/sources.py +++ b/mindee/input/sources/local_input_source.py @@ -1,11 +1,8 @@ -import base64 import io import mimetypes -import os import tempfile from enum import Enum -from pathlib import Path -from typing import BinaryIO, Optional, Sequence, Tuple, Union +from typing import BinaryIO, Optional, Sequence, Tuple import pypdfium2 as pdfium @@ -205,91 +202,3 @@ def read_contents(self, close_file: bool) -> Tuple[str, bytes]: def close(self) -> None: """Close the file object.""" self.file_object.close() - - -class FileInput(LocalInputSource): - """A binary file input.""" - - def __init__(self, file: BinaryIO) -> None: - """ - Input document from a Python binary file object. - - Note: the calling function is responsible for closing the file. - - :param file: FileIO object - """ - assert file.name, "File name must be set" - - self.file_object = file - self.filename = os.path.basename(file.name) - self.filepath = file.name - super().__init__(input_type=InputType.FILE) - - -class PathInput(LocalInputSource): - """A local path input.""" - - def __init__(self, filepath: Union[Path, str]) -> None: - """ - Input document from a path. 
- - :param filepath: Path to open - """ - self.file_object = open(filepath, "rb") # pylint: disable=consider-using-with - self.filename = os.path.basename(filepath) - self.filepath = str(filepath) - super().__init__(input_type=InputType.PATH) - - -class BytesInput(LocalInputSource): - """Raw bytes input.""" - - def __init__(self, raw_bytes: bytes, filename: str) -> None: - """ - Input document from raw bytes (no buffer). - - :param raw_bytes: Raw data as bytes - :param filename: File name of the input - """ - self.file_object = io.BytesIO(raw_bytes) - self.filename = filename - self.filepath = None - super().__init__(input_type=InputType.BYTES) - - -class Base64Input(LocalInputSource): - """Base64-encoded text input.""" - - def __init__(self, base64_string: str, filename: str) -> None: - """ - Input document from a base64 encoded string. - - :param base64_string: Raw data as a base64 encoded string - :param filename: File name of the input - """ - self.file_object = io.BytesIO(base64.standard_b64decode(base64_string)) - self.filename = filename - self.filepath = None - super().__init__(input_type=InputType.BASE64) - - -class UrlInputSource: - """A local or distant URL input.""" - - url: str - """The Uniform Resource Locator.""" - - def __init__(self, url: str) -> None: - """ - Input document from a base64 encoded string. 
- - :param url: URL to send, must be HTTPS - """ - if not url.lower().startswith("https"): - raise MindeeSourceError("URL must be HTTPS") - - self.input_type = InputType.URL - - logger.debug("URL input: %s", url) - - self.url = url diff --git a/mindee/input/sources/path_input.py b/mindee/input/sources/path_input.py new file mode 100644 index 00000000..3f9698b4 --- /dev/null +++ b/mindee/input/sources/path_input.py @@ -0,0 +1,20 @@ +import os +from pathlib import Path +from typing import Union + +from mindee.input.sources.local_input_source import InputType, LocalInputSource + + +class PathInput(LocalInputSource): + """A local path input.""" + + def __init__(self, filepath: Union[Path, str]) -> None: + """ + Input document from a path. + + :param filepath: Path to open + """ + self.file_object = open(filepath, "rb") # pylint: disable=consider-using-with + self.filename = os.path.basename(filepath) + self.filepath = str(filepath) + super().__init__(input_type=InputType.PATH) diff --git a/mindee/input/sources/url_input_source.py b/mindee/input/sources/url_input_source.py new file mode 100644 index 00000000..4f26dc02 --- /dev/null +++ b/mindee/input/sources/url_input_source.py @@ -0,0 +1,25 @@ +from mindee.error.mindee_error import MindeeSourceError +from mindee.input.sources.local_input_source import InputType +from mindee.logger import logger + + +class UrlInputSource: + """A local or distant URL input.""" + + url: str + """The Uniform Resource Locator.""" + + def __init__(self, url: str) -> None: + """ + Input document from a URL. + + :param url: URL to send, must be HTTPS + """ + if not url.lower().startswith("https"): + raise MindeeSourceError("URL must be HTTPS") + + self.input_type = InputType.URL + + logger.debug("URL input: %s", url) + + self.url = url