Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion gen.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ generation:
schemas:
allOfMergeStrategy: shallowMerge
python:
version: 0.42.9
version: 0.42.10
additionalDependencies:
dev:
deepdiff: '>=6.0'
Expand All @@ -40,6 +40,7 @@ python:
cryptography: '>=3.1'
httpx: '>=0.27.0'
pypdf: '>= 6.2.0'
pypdfium2: '>= 5.0.0'
requests-toolbelt: '>=1.0.0'
allowedRedefinedBuiltins:
- id
Expand Down
38 changes: 35 additions & 3 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

[project]
name = "unstructured-client"
version = "0.42.9"
version = "0.42.10"
description = "Python Client SDK for Unstructured API"
authors = [{ name = "Unstructured" },]
readme = "README-PYPI.md"
Expand All @@ -13,6 +13,7 @@ dependencies = [
"httpx >=0.27.0",
"pydantic >=2.11.2",
"pypdf >= 6.2.0",
"pypdfium2 >= 5.0.0",
"requests-toolbelt >=1.0.0",
]

Expand Down
102 changes: 61 additions & 41 deletions src/unstructured_client/_hooks/custom/split_pdf_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import httpx
from httpx import AsyncClient
from pypdf import PdfReader, PdfWriter
import pypdfium2 as pdfium # type: ignore[import-untyped]

from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
Expand Down Expand Up @@ -349,9 +350,12 @@ def before_request(

pdf = self._trim_large_pages(pdf, form_data)

pdf.stream.seek(0)
pdf_bytes = pdf.stream.read()

if self.cache_tmp_data_feature:
pdf_chunk_paths = self._get_pdf_chunk_paths(
pdf,
pdf_bytes,
operation_id=operation_id,
split_size=split_size,
page_start=page_range_start,
Expand All @@ -362,7 +366,7 @@ def before_request(
pdf_chunks = self._get_pdf_chunk_files(pdf_chunk_paths)
else:
pdf_chunks = self._get_pdf_chunks_in_memory(
pdf,
pdf_bytes,
split_size=split_size,
page_start=page_range_start,
page_end=page_range_end
Expand Down Expand Up @@ -467,7 +471,7 @@ def _trim_large_pages(self, pdf: PdfReader, form_data: dict[str, Any]) -> PdfRea

def _get_pdf_chunks_in_memory(
self,
pdf: PdfReader,
pdf_bytes: bytes,
split_size: int = 1,
page_start: int = 1,
page_end: Optional[int] = None
Expand All @@ -488,27 +492,34 @@ def _get_pdf_chunks_in_memory(
The list of temporary file paths.
"""

offset = page_start - 1
offset_end = page_end or len(pdf.pages)
with pdfium.PdfDocument(pdf_bytes) as pdf:

offset = page_start - 1
offset_end = page_end if page_end else len(pdf)

while offset < offset_end:
end = min(offset + split_size, offset_end)

# Create new PDF
new_pdf = pdfium.PdfDocument.new()

chunk_no = 0
while offset < offset_end:
chunk_no += 1
new_pdf = PdfWriter()
chunk_buffer = io.BytesIO()
# Import pages
page_indices = list(range(offset, end))
new_pdf.import_pages(pdf, pages=page_indices)

end = min(offset + split_size, offset_end)
# Save to buffer
chunk_buffer = io.BytesIO()
new_pdf.save(chunk_buffer)
chunk_buffer.seek(0)

for page in list(pdf.pages[offset:end]):
new_pdf.add_page(page)
new_pdf.write(chunk_buffer)
chunk_buffer.seek(0)
yield chunk_buffer, offset
offset += split_size
new_pdf.close()

yield chunk_buffer, offset
offset += split_size

def _get_pdf_chunk_paths(
self,
pdf: PdfReader,
pdf_bytes: bytes,
operation_id: str,
split_size: int = 1,
page_start: int = 1,
Expand All @@ -530,30 +541,39 @@ def _get_pdf_chunk_paths(
The list of temporary file paths.
"""

offset = page_start - 1
offset_end = page_end or len(pdf.pages)
with pdfium.PdfDocument(pdf_bytes) as pdf:
offset = page_start - 1
offset_end = page_end if page_end else len(pdf)

tempdir = tempfile.TemporaryDirectory( # pylint: disable=consider-using-with
dir=self.cache_tmp_data_dir,
prefix="unstructured_client_"
)
self.tempdirs[operation_id] = tempdir
tempdir_path = Path(tempdir.name)
pdf_chunk_paths: list[Tuple[Path, int]] = []
chunk_no = 0
while offset < offset_end:
chunk_no += 1
new_pdf = PdfWriter()

end = min(offset + split_size, offset_end)

for page in list(pdf.pages[offset:end]):
new_pdf.add_page(page)
with open(tempdir_path / f"chunk_{chunk_no}.pdf", "wb") as pdf_chunk:
new_pdf.write(pdf_chunk)
pdf_chunk_paths.append((Path(pdf_chunk.name), offset))
offset += split_size
return pdf_chunk_paths
# Create temporary directory
tempdir = tempfile.TemporaryDirectory( # pylint: disable=consider-using-with
dir=self.cache_tmp_data_dir,
prefix="unstructured_client_"
)
self.tempdirs[operation_id] = tempdir
tempdir_path = Path(tempdir.name)

pdf_chunk_paths: list[Tuple[Path, int]] = []
chunk_no = 0

while offset < offset_end:
chunk_no += 1
end = min(offset + split_size, offset_end)

# Create new PDF with selected pages
new_pdf = pdfium.PdfDocument.new()
page_indices = list(range(offset, end))
new_pdf.import_pages(pdf, pages=page_indices)

# Save to file
chunk_path = tempdir_path / f"chunk_{chunk_no}.pdf"
new_pdf.save(str(chunk_path)) # Convert Path to string
new_pdf.close()

pdf_chunk_paths.append((chunk_path, offset))
offset += split_size

return pdf_chunk_paths

def _get_pdf_chunk_files(
self, pdf_chunks: list[Tuple[Path, int]]
Expand Down
Loading