Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion gen.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ generation:
schemas:
allOfMergeStrategy: shallowMerge
python:
version: 0.42.9
version: 0.42.10
additionalDependencies:
dev:
deepdiff: '>=6.0'
Expand All @@ -40,6 +40,7 @@ python:
cryptography: '>=3.1'
httpx: '>=0.27.0'
pypdf: '>= 6.2.0'
pypdfium2: '>= 5.0.0'
requests-toolbelt: '>=1.0.0'
allowedRedefinedBuiltins:
- id
Expand Down
38 changes: 35 additions & 3 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

[project]
name = "unstructured-client"
version = "0.42.9"
version = "0.42.10"
description = "Python Client SDK for Unstructured API"
authors = [{ name = "Unstructured" },]
readme = "README-PYPI.md"
Expand All @@ -13,6 +13,7 @@ dependencies = [
"httpx >=0.27.0",
"pydantic >=2.11.2",
"pypdf >= 6.2.0",
"pypdfium2 >= 5.0.0",
"requests-toolbelt >=1.0.0",
]

Expand Down
102 changes: 61 additions & 41 deletions src/unstructured_client/_hooks/custom/split_pdf_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import httpx
from httpx import AsyncClient
from pypdf import PdfReader, PdfWriter
import pypdfium2 as pdfium # type: ignore[import-untyped]

from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
Expand Down Expand Up @@ -349,9 +350,12 @@ def before_request(

pdf = self._trim_large_pages(pdf, form_data)

pdf.stream.seek(0)
pdf_bytes = pdf.stream.read()

if self.cache_tmp_data_feature:
pdf_chunk_paths = self._get_pdf_chunk_paths(
pdf,
pdf_bytes,
operation_id=operation_id,
split_size=split_size,
page_start=page_range_start,
Expand All @@ -362,7 +366,7 @@ def before_request(
pdf_chunks = self._get_pdf_chunk_files(pdf_chunk_paths)
else:
pdf_chunks = self._get_pdf_chunks_in_memory(
pdf,
pdf_bytes,
split_size=split_size,
page_start=page_range_start,
page_end=page_range_end
Expand Down Expand Up @@ -467,7 +471,7 @@ def _trim_large_pages(self, pdf: PdfReader, form_data: dict[str, Any]) -> PdfRea

def _get_pdf_chunks_in_memory(
self,
pdf: PdfReader,
pdf_bytes: bytes,
split_size: int = 1,
page_start: int = 1,
page_end: Optional[int] = None
Expand All @@ -488,27 +492,34 @@ def _get_pdf_chunks_in_memory(
The list of temporary file paths.
"""

offset = page_start - 1
offset_end = page_end or len(pdf.pages)
with pdfium.PdfDocument(pdf_bytes) as pdf:

offset = page_start - 1
offset_end = page_end if page_end else len(pdf)

while offset < offset_end:
end = min(offset + split_size, offset_end)

# Create new PDF
new_pdf = pdfium.PdfDocument.new()

chunk_no = 0
while offset < offset_end:
chunk_no += 1
new_pdf = PdfWriter()
chunk_buffer = io.BytesIO()
# Import pages
page_indices = list(range(offset, end))
new_pdf.import_pages(pdf, pages=page_indices)

end = min(offset + split_size, offset_end)
# Save to buffer
chunk_buffer = io.BytesIO()
new_pdf.save(chunk_buffer)
chunk_buffer.seek(0)

for page in list(pdf.pages[offset:end]):
new_pdf.add_page(page)
new_pdf.write(chunk_buffer)
chunk_buffer.seek(0)
yield chunk_buffer, offset
offset += split_size
new_pdf.close()

yield chunk_buffer, offset
offset += split_size

def _get_pdf_chunk_paths(
self,
pdf: PdfReader,
pdf_bytes: bytes,
operation_id: str,
split_size: int = 1,
page_start: int = 1,
Expand All @@ -530,30 +541,39 @@ def _get_pdf_chunk_paths(
The list of temporary file paths.
"""

offset = page_start - 1
offset_end = page_end or len(pdf.pages)
with pdfium.PdfDocument(pdf_bytes) as pdf:
offset = page_start - 1
offset_end = page_end if page_end else len(pdf)

tempdir = tempfile.TemporaryDirectory( # pylint: disable=consider-using-with
dir=self.cache_tmp_data_dir,
prefix="unstructured_client_"
)
self.tempdirs[operation_id] = tempdir
tempdir_path = Path(tempdir.name)
pdf_chunk_paths: list[Tuple[Path, int]] = []
chunk_no = 0
while offset < offset_end:
chunk_no += 1
new_pdf = PdfWriter()

end = min(offset + split_size, offset_end)

for page in list(pdf.pages[offset:end]):
new_pdf.add_page(page)
with open(tempdir_path / f"chunk_{chunk_no}.pdf", "wb") as pdf_chunk:
new_pdf.write(pdf_chunk)
pdf_chunk_paths.append((Path(pdf_chunk.name), offset))
offset += split_size
return pdf_chunk_paths
# Create temporary directory
tempdir = tempfile.TemporaryDirectory( # pylint: disable=consider-using-with
dir=self.cache_tmp_data_dir,
prefix="unstructured_client_"
)
self.tempdirs[operation_id] = tempdir
tempdir_path = Path(tempdir.name)

pdf_chunk_paths: list[Tuple[Path, int]] = []
chunk_no = 0

while offset < offset_end:
chunk_no += 1
end = min(offset + split_size, offset_end)

# Create new PDF with selected pages
new_pdf = pdfium.PdfDocument.new()
page_indices = list(range(offset, end))
new_pdf.import_pages(pdf, pages=page_indices)

# Save to file
chunk_path = tempdir_path / f"chunk_{chunk_no}.pdf"
new_pdf.save(str(chunk_path)) # Convert Path to string
new_pdf.close()

pdf_chunk_paths.append((chunk_path, offset))
offset += split_size

return pdf_chunk_paths

def _get_pdf_chunk_files(
self, pdf_chunks: list[Tuple[Path, int]]
Expand Down
Loading