From 8e5842087c412394aed6b40e8b6bccf3139e52b4 Mon Sep 17 00:00:00 2001 From: HungKNguyen Date: Fri, 30 Jan 2026 15:06:58 +0700 Subject: [PATCH] feat: add security changes --- CHANGELOG.md | 46 ++ docs/MIGRATION.md | 75 +++ pyproject.toml | 2 +- src/nutrient_dws/__init__.py | 6 +- src/nutrient_dws/builder/builder.py | 14 +- src/nutrient_dws/builder/constant.py | 16 +- src/nutrient_dws/builder/staged_builders.py | 8 +- src/nutrient_dws/client.py | 700 +++++++++----------- src/nutrient_dws/inputs.py | 84 +-- tests/test_integration.py | 3 +- tests/unit/test_client.py | 32 - tests/unit/test_inputs.py | 134 +--- 12 files changed, 500 insertions(+), 620 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 docs/MIGRATION.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..e6fcf67 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,46 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [Unreleased] + +## [3.0.0] - 2026-01-30 + +### Security + +- **CRITICAL**: Removed client-side URL fetching to prevent SSRF vulnerabilities +- URLs are now passed to the server for secure server-side fetching +- Restricted `sign()` method to local files only (API limitation) + +### Changed + +- **BREAKING**: `sign()` and `create_redactions_ai()` only accept local files (paths, bytes, file objects) - no URLs +- **BREAKING**: Most methods now accept `FileInputWithUrl` - URLs passed to server +- **BREAKING**: Removed client-side PDF parsing - leverage API's negative index support +- Methods like `rotate()`, `split()`, `delete_pages()` now support negative indices (-1 = last page) +- All methods except `sign()` and `create_redactions_ai()` accept URLs that are passed securely to the server + +### Removed + +- **BREAKING**: Removed `process_remote_file_input()` from public API (security risk) +- **BREAKING**: Removed `get_pdf_page_count()` from public API (client-side PDF parsing) +- **BREAKING**: Removed `is_valid_pdf()` from public API (internal use only) +- Removed ~200 lines of client-side PDF parsing code + +### Added + +- SSRF protection documentation in README +- Migration guide (docs/MIGRATION.md) +- Security best practices for handling remote files +- Support for negative page indices in all page-based methods + +## [2.0.0] - 2025-01-09 + +- Initial stable release with full API coverage +- Async-first design with httpx and aiofiles +- Comprehensive type hints and mypy strict mode +- Workflow builder with staged pattern +- Error hierarchy with typed exceptions diff --git a/docs/MIGRATION.md b/docs/MIGRATION.md new file mode 100644 index 0000000..19f6c9c --- /dev/null +++ b/docs/MIGRATION.md @@ -0,0 +1,75 @@ +# Migration Guide: v2.x to v3.0 + +## Overview + +Version 3.0.0 introduces SSRF protection and removes client-side PDF parsing. + +## Key Changes + +### 1. 
`sign()` No Longer Accepts URLs (API Limitation) + +**Before (v2.x)**: +```python +result = await client.sign('https://example.com/document.pdf', {...}) +``` + +**After (v3.0)** - Fetch file first: +```python +import httpx + +async with httpx.AsyncClient() as http: + url = 'https://trusted-domain.com/document.pdf' + + # IMPORTANT: Validate URL + if not url.startswith('https://trusted-domain.com/'): + raise ValueError('URL not from trusted domain') + + response = await http.get(url, timeout=10.0) + response.raise_for_status() + pdf_bytes = response.content + +result = await client.sign(pdf_bytes, {...}) +``` + +### 2. Most Methods Now Accept URLs (Passed to Server) + +Good news! These methods now support URLs passed securely to the server: +- `rotate()`, `split()`, `add_page()`, `duplicate_pages()`, `delete_pages()` +- `set_page_labels()`, `set_metadata()`, `optimize()` +- `flatten()`, `apply_instant_json()`, `apply_xfdf()` +- All redaction methods except `create_redactions_ai()` (local files only) +- `convert()`, `ocr()`, `watermark_*()`, `extract_*()`, `merge()`, `password_protect()` + +**Example**: +```python +# This now works! +result = await client.rotate('https://example.com/doc.pdf', 90, pages={'start': 0, 'end': 5}) +``` + +### 3. Negative Page Indices Now Supported + +Use negative indices for "from end" references: +- `-1` = last page +- `-2` = second-to-last page +- etc. + +**Examples**: +```python +# Rotate last 3 pages +await client.rotate(pdf, 90, pages={'start': -3, 'end': -1}) + +# Delete first and last pages +await client.delete_pages(pdf, [0, -1]) + +# Split: keep middle pages, excluding first and last +await client.split(pdf, [{'start': 1, 'end': -2}]) +``` + +### 4. 
Removed from Public API + +- `process_remote_file_input()` - No longer needed (URLs passed to server) +- `get_pdf_page_count()` - Use negative indices instead +- `is_valid_pdf()` - Let server validate (internal use only) + +**Still Available:** +- `is_remote_file_input()` - Helper to detect if input is a URL (still public) diff --git a/pyproject.toml b/pyproject.toml index 507caaa..340a47e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ nutrient_dws_scripts = [ [project] name = "nutrient-dws" -version = "2.0.0" +version = "3.0.0" description = "Python client library for Nutrient Document Web Services API" readme = "README.md" requires-python = ">=3.10" diff --git a/src/nutrient_dws/__init__.py b/src/nutrient_dws/__init__.py index 0cab185..277407d 100644 --- a/src/nutrient_dws/__init__.py +++ b/src/nutrient_dws/__init__.py @@ -12,9 +12,10 @@ ValidationError, ) from nutrient_dws.inputs import ( + FileInputWithUrl, + LocalFileInput, is_remote_file_input, process_file_input, - process_remote_file_input, validate_file_input, ) from nutrient_dws.utils import get_library_version, get_user_agent @@ -22,6 +23,8 @@ __all__ = [ "APIError", "AuthenticationError", + "FileInputWithUrl", + "LocalFileInput", "NetworkError", "NutrientClient", "NutrientError", @@ -30,6 +33,5 @@ "get_user_agent", "is_remote_file_input", "process_file_input", - "process_remote_file_input", "validate_file_input", ] diff --git a/src/nutrient_dws/builder/builder.py b/src/nutrient_dws/builder/builder.py index 5497e88..9d0e5a0 100644 --- a/src/nutrient_dws/builder/builder.py +++ b/src/nutrient_dws/builder/builder.py @@ -26,7 +26,7 @@ NutrientClientOptions, ) from nutrient_dws.inputs import ( - FileInput, + FileInputWithUrl, NormalizedFileData, is_remote_file_input, process_file_input, @@ -76,16 +76,16 @@ def __init__(self, client_options: NutrientClientOptions) -> None: """ super().__init__(client_options) self.build_instructions: BuildInstructions = {"parts": []} - self.assets: 
dict[str, FileInput] = {} + self.assets: dict[str, FileInputWithUrl] = {} self.asset_index = 0 self.current_step = 0 self.is_executed = False - def _register_asset(self, asset: FileInput) -> str: + def _register_asset(self, asset: FileInputWithUrl) -> str: """Register an asset in the workflow and return its key for use in actions. Args: - asset: The asset to register + asset: The asset to register (must be local, not URL) Returns: The asset key that can be used in BuildActions @@ -188,7 +188,7 @@ def _cleanup(self) -> None: def add_file_part( self, - file: FileInput, + file: FileInputWithUrl, options: FilePartOptions | None = None, actions: list[ApplicableAction] | None = None, ) -> WorkflowWithPartsStage: @@ -229,8 +229,8 @@ def add_file_part( def add_html_part( self, - html: FileInput, - assets: list[FileInput] | None = None, + html: FileInputWithUrl, + assets: list[FileInputWithUrl] | None = None, options: HTMLPartOptions | None = None, actions: list[ApplicableAction] | None = None, ) -> WorkflowWithPartsStage: diff --git a/src/nutrient_dws/builder/constant.py b/src/nutrient_dws/builder/constant.py index b61ea5c..437f52d 100644 --- a/src/nutrient_dws/builder/constant.py +++ b/src/nutrient_dws/builder/constant.py @@ -1,7 +1,7 @@ from collections.abc import Callable from typing import Any, Literal, Protocol, TypeVar, cast -from nutrient_dws.inputs import FileInput +from nutrient_dws.inputs import FileInputWithUrl from nutrient_dws.types.build_actions import ( ApplyInstantJsonAction, ApplyRedactionsAction, @@ -53,7 +53,7 @@ class ActionWithFileInput(Protocol): """Internal action type that holds FileInput for deferred registration.""" __needsFileRegistration: bool - fileInput: FileInput + fileInput: FileInputWithUrl createAction: Callable[[FileHandle], BuildAction] @@ -133,7 +133,7 @@ def watermark_text( @staticmethod def watermark_image( - image: FileInput, options: ImageWatermarkActionOptions | None = None + image: FileInputWithUrl, options: 
ImageWatermarkActionOptions | None = None ) -> ActionWithFileInput: """Create an image watermark action. @@ -163,7 +163,7 @@ class ImageWatermarkActionWithFileInput(ActionWithFileInput): __needsFileRegistration = True def __init__( - self, file_input: FileInput, opts: ImageWatermarkActionOptions + self, file_input: FileInputWithUrl, opts: ImageWatermarkActionOptions ): self.fileInput = file_input self.options = opts @@ -196,7 +196,7 @@ def flatten(annotation_ids: list[str | int] | None = None) -> FlattenAction: return result @staticmethod - def apply_instant_json(file: FileInput) -> ActionWithFileInput: + def apply_instant_json(file: FileInputWithUrl) -> ActionWithFileInput: """Create an apply Instant JSON action. Args: @@ -209,7 +209,7 @@ def apply_instant_json(file: FileInput) -> ActionWithFileInput: class ApplyInstantJsonActionWithFileInput(ActionWithFileInput): __needsFileRegistration = True - def __init__(self, file_input: FileInput): + def __init__(self, file_input: FileInputWithUrl): self.fileInput = file_input def createAction(self, fileHandle: FileHandle) -> ApplyInstantJsonAction: @@ -222,7 +222,7 @@ def createAction(self, fileHandle: FileHandle) -> ApplyInstantJsonAction: @staticmethod def apply_xfdf( - file: FileInput, options: ApplyXfdfActionOptions | None = None + file: FileInputWithUrl, options: ApplyXfdfActionOptions | None = None ) -> ActionWithFileInput: """Create an apply XFDF action. 
@@ -240,7 +240,7 @@ class ApplyXfdfActionWithFileInput(ActionWithFileInput): __needsFileRegistration = True def __init__( - self, file_input: FileInput, opts: ApplyXfdfActionOptions | None + self, file_input: FileInputWithUrl, opts: ApplyXfdfActionOptions | None ): self.fileInput = file_input self.options = opts or {} diff --git a/src/nutrient_dws/builder/staged_builders.py b/src/nutrient_dws/builder/staged_builders.py index bfd0743..bbfb5bd 100644 --- a/src/nutrient_dws/builder/staged_builders.py +++ b/src/nutrient_dws/builder/staged_builders.py @@ -10,7 +10,7 @@ from nutrient_dws.types.build_actions import BuildAction if TYPE_CHECKING: - from nutrient_dws.inputs import FileInput + from nutrient_dws.inputs import FileInputWithUrl from nutrient_dws.types.analyze_response import AnalyzeBuildResponse from nutrient_dws.types.build_output import ( ImageOutputOptions, @@ -114,7 +114,7 @@ class WorkflowInitialStage(ABC): @abstractmethod def add_file_part( self, - file: FileInput, + file: FileInputWithUrl, options: FilePartOptions | None = None, actions: list[ApplicableAction] | None = None, ) -> WorkflowWithPartsStage: @@ -124,8 +124,8 @@ def add_file_part( @abstractmethod def add_html_part( self, - html: FileInput, - assets: list[FileInput] | None = None, + html: FileInputWithUrl, + assets: list[FileInputWithUrl] | None = None, options: HTMLPartOptions | None = None, actions: list[ApplicableAction] | None = None, ) -> WorkflowWithPartsStage: diff --git a/src/nutrient_dws/client.py b/src/nutrient_dws/client.py index 38e2b2e..6e68ecf 100644 --- a/src/nutrient_dws/client.py +++ b/src/nutrient_dws/client.py @@ -25,12 +25,10 @@ send_request, ) from nutrient_dws.inputs import ( - FileInput, - get_pdf_page_count, - is_remote_file_input, + FileInputWithUrl, + LocalFileInput, is_valid_pdf, process_file_input, - process_remote_file_input, ) from nutrient_dws.types.account_info import AccountInfo from nutrient_dws.types.build_actions import ( @@ -299,14 +297,18 @@ def 
_process_typed_workflow_result( async def sign( self, - pdf: FileInput, + pdf: LocalFileInput, data: CreateDigitalSignature | None = None, options: SignRequestOptions | None = None, ) -> BufferOutput: """Sign a PDF document. + **Security Note**: This method only accepts local files (paths, bytes, file objects) + due to an API limitation. URLs are not supported. For remote files, fetch them first + with proper URL validation. + Args: - pdf: The PDF file to sign + pdf: The local PDF file to sign (no URLs) data: Signature data options: Additional options (image, graphicImage) @@ -315,12 +317,29 @@ async def sign( Example: ```python + # Example 1: Sign a local file result = await client.sign('document.pdf', { 'signatureType': 'cms', 'flatten': False, 'cadesLevel': 'b-lt' }) + # Example 2: Sign a remote file (fetch first) + import httpx + async with httpx.AsyncClient() as http: + # Validate URL before fetching + url = 'https://trusted-domain.com/document.pdf' + if not url.startswith('https://trusted-domain.com/'): + raise ValueError('URL not from trusted domain') + + response = await http.get(url, timeout=10.0) + response.raise_for_status() + pdf_bytes = response.content + + result = await client.sign(pdf_bytes, { + 'signatureType': 'cms' + }) + # Access the signed PDF buffer pdf_buffer = result['buffer'] @@ -332,35 +351,24 @@ async def sign( f.write(pdf_buffer) ``` """ - # Normalize the file input - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) + # Process as local file only (no URL support) + normalized_file = await process_file_input(pdf) if not is_valid_pdf(normalized_file[0]): raise ValidationError("Invalid pdf file", {"input": pdf}) - # Prepare optional files + # Prepare optional files (local files only) normalized_image = None normalized_graphic_image = None if options: if "image" in options: image = options["image"] - if is_remote_file_input(image): - 
normalized_image = await process_remote_file_input(str(image)) - else: - normalized_image = await process_file_input(image) + normalized_image = await process_file_input(image) if "graphicImage" in options: graphic_image = options["graphicImage"] - if is_remote_file_input(graphic_image): - normalized_graphic_image = await process_remote_file_input( - str(graphic_image) - ) - else: - normalized_graphic_image = await process_file_input(graphic_image) + normalized_graphic_image = await process_file_input(graphic_image) request_data = { "file": normalized_file, @@ -392,15 +400,17 @@ async def sign( async def watermark_text( self, - file: FileInput, + file: FileInputWithUrl, text: str, options: TextWatermarkActionOptions | None = None, ) -> BufferOutput: """Add a text watermark to a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Args: - file: The input file to watermark + file: The input file to watermark (URLs supported) text: The watermark text options: Watermark options @@ -414,6 +424,9 @@ async def watermark_text( 'fontSize': 24 }) + # Works with URLs too + result = await client.watermark_text('https://example.com/doc.pdf', 'CONFIDENTIAL') + # Access the watermarked PDF buffer pdf_buffer = result['buffer'] @@ -431,15 +444,17 @@ async def watermark_text( async def watermark_image( self, - file: FileInput, - image: FileInput, + file: FileInputWithUrl, + image: FileInputWithUrl, options: ImageWatermarkActionOptions | None = None, ) -> BufferOutput: """Add an image watermark to a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Args: - file: The input file to watermark + file: The input file to watermark (URLs supported) image: The watermark image. Can be a file path (string or Path), bytes, file-like object, or a URL to a remote image. 
options: Watermark options @@ -482,14 +497,16 @@ async def watermark_image( async def convert( self, - file: FileInput, + file: FileInputWithUrl, target_format: OutputFormat, ) -> BufferOutput | ContentOutput | JsonContentOutput: """Convert a document to a different format. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Args: - file: The input file to convert + file: The input file to convert (URLs supported) target_format: The target format to convert to Returns: @@ -508,6 +525,9 @@ async def convert( # Convert to HTML html_result = await client.convert('document.pdf', 'html') html_content = html_result['content'] + + # Works with URLs + pdf_result = await client.convert('https://example.com/document.docx', 'pdf') ``` """ builder = self.workflow().add_file_part(file) @@ -540,12 +560,14 @@ async def convert( async def ocr( self, - file: FileInput, + file: FileInputWithUrl, language: OcrLanguage | list[OcrLanguage], ) -> BufferOutput: """Perform OCR (Optical Character Recognition) on a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Args: file: The input file to perform OCR on language: The language(s) to use for OCR. Can be a single language @@ -575,14 +597,16 @@ async def ocr( async def extract_text( self, - file: FileInput, + file: FileInputWithUrl, pages: PageRange | None = None, ) -> JsonContentOutput: """Extract text content from a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. 
+ Args: - file: The file to extract text from + file: The file to extract text from (URLs supported) pages: Optional page range to extract text from Returns: @@ -596,6 +620,9 @@ async def extract_text( # Extract text from specific pages result = await client.extract_text('document.pdf', {'start': 0, 'end': 2}) + # Works with URLs + result = await client.extract_text('https://example.com/doc.pdf') + # Access the extracted text content text_content = result['data']['pages'][0]['plainText'] ``` @@ -621,14 +648,16 @@ async def extract_text( async def extract_table( self, - file: FileInput, + file: FileInputWithUrl, pages: PageRange | None = None, ) -> JsonContentOutput: """Extract table content from a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Args: - file: The file to extract table from + file: The file to extract table from (URLs supported) pages: Optional page range to extract tables from Returns: @@ -638,6 +667,9 @@ async def extract_table( ```python result = await client.extract_table('document.pdf') + # Works with URLs + result = await client.extract_table('https://example.com/doc.pdf') + # Access the extracted tables tables = result['data']['pages'][0]['tables'] @@ -668,14 +700,16 @@ async def extract_table( async def extract_key_value_pairs( self, - file: FileInput, + file: FileInputWithUrl, pages: PageRange | None = None, ) -> JsonContentOutput: """Extract key value pair content from a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. 
+ Args: - file: The file to extract KVPs from + file: The file to extract KVPs from (URLs supported) pages: Optional page range to extract KVPs from Returns: @@ -685,6 +719,9 @@ async def extract_key_value_pairs( ```python result = await client.extract_key_value_pairs('document.pdf') + # Works with URLs + result = await client.extract_key_value_pairs('https://example.com/doc.pdf') + # Access the extracted key-value pairs kvps = result['data']['pages'][0]['keyValuePairs'] @@ -718,14 +755,17 @@ async def extract_key_value_pairs( async def set_page_labels( self, - pdf: FileInput, + pdf: FileInputWithUrl, labels: list[Label], ) -> BufferOutput: """Set page labels for a PDF document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + PDF validation is performed server-side. + Args: - pdf: The PDF file to modify + pdf: The PDF file to modify (URLs supported) labels: Array of label objects with pages and label properties Returns: @@ -737,17 +777,12 @@ async def set_page_labels( {'pages': [0, 1, 2], 'label': 'Cover'}, {'pages': [3, 4, 5], 'label': 'Chapter 1'} ]) + + # Works with URLs + result = await client.set_page_labels('https://example.com/doc.pdf', labels) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - + # No client-side PDF validation - let server handle it result = ( await self.workflow() .add_file_part(pdf) @@ -759,7 +794,7 @@ async def set_page_labels( async def password_protect( self, - file: FileInput, + file: FileInputWithUrl, user_password: str, owner_password: str, permissions: list[PDFUserPermission] | None = None, @@ -767,8 +802,10 @@ async def password_protect( """Password protect a PDF document. 
This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Args: - file: The file to protect + file: The file to protect (URLs supported) user_password: Password required to open the document owner_password: Password required to modify the document permissions: Optional array of permissions granted when opened with user password @@ -805,14 +842,17 @@ async def password_protect( async def set_metadata( self, - pdf: FileInput, + pdf: FileInputWithUrl, metadata: Metadata, ) -> BufferOutput: """Set metadata for a PDF document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + PDF validation is performed server-side. + Args: - pdf: The PDF file to modify + pdf: The PDF file to modify (URLs supported) metadata: The metadata to set (title and/or author) Returns: @@ -824,17 +864,12 @@ async def set_metadata( 'title': 'My Document', 'author': 'John Doe' }) + + # Works with URLs + result = await client.set_metadata('https://example.com/doc.pdf', metadata) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - + # No client-side PDF validation - let server handle it result = ( await self.workflow() .add_file_part(pdf) @@ -846,15 +881,18 @@ async def set_metadata( async def apply_instant_json( self, - pdf: FileInput, - instant_json_file: FileInput, + pdf: FileInputWithUrl, + instant_json_file: FileInputWithUrl, ) -> BufferOutput: """Apply Instant JSON to a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + PDF validation is performed server-side. 
+ Args: - pdf: The PDF file to modify - instant_json_file: The Instant JSON file to apply + pdf: The PDF file to modify (URLs supported) + instant_json_file: The Instant JSON file to apply (URLs supported) Returns: The modified document @@ -862,17 +900,15 @@ async def apply_instant_json( Example: ```python result = await client.apply_instant_json('document.pdf', 'annotations.json') + + # Works with URLs + result = await client.apply_instant_json( + 'https://example.com/doc.pdf', + 'https://example.com/annotations.json' + ) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - + # No client-side PDF validation - let server handle it apply_json_action = BuildActions.apply_instant_json(instant_json_file) result = ( @@ -886,16 +922,19 @@ async def apply_instant_json( async def apply_xfdf( self, - pdf: FileInput, - xfdf_file: FileInput, + pdf: FileInputWithUrl, + xfdf_file: FileInputWithUrl, options: ApplyXfdfActionOptions | None = None, ) -> BufferOutput: """Apply XFDF to a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + PDF validation is performed server-side. 
+ Args: - pdf: The PDF file to modify - xfdf_file: The XFDF file to apply + pdf: The PDF file to modify (URLs supported) + xfdf_file: The XFDF file to apply (URLs supported) options: Optional settings for applying XFDF Returns: @@ -904,22 +943,21 @@ async def apply_xfdf( Example: ```python result = await client.apply_xfdf('document.pdf', 'annotations.xfdf') - # Or with options: + + # With options: result = await client.apply_xfdf( 'document.pdf', 'annotations.xfdf', {'ignorePageRotation': True, 'richTextEnabled': False} ) + + # Works with URLs + result = await client.apply_xfdf( + 'https://example.com/doc.pdf', + 'https://example.com/annotations.xfdf' + ) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - + # No client-side PDF validation - let server handle it apply_xfdf_action = BuildActions.apply_xfdf(xfdf_file, options) result = ( @@ -931,12 +969,14 @@ async def apply_xfdf( return cast("BufferOutput", self._process_typed_workflow_result(result)) - async def merge(self, files: list[FileInput]) -> BufferOutput: + async def merge(self, files: list[FileInputWithUrl]) -> BufferOutput: """Merge multiple documents into a single document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. 
+ Args: - files: The files to merge + files: The files to merge (URLs supported) Returns: The merged document @@ -945,6 +985,13 @@ async def merge(self, files: list[FileInput]) -> BufferOutput: ```python result = await client.merge(['doc1.pdf', 'doc2.pdf', 'doc3.pdf']) + # Works with URLs + result = await client.merge([ + 'https://example.com/doc1.pdf', + 'doc2.pdf', + 'https://example.com/doc3.pdf' + ]) + # Access the merged PDF buffer pdf_buffer = result['buffer'] ``` @@ -966,12 +1013,15 @@ async def merge(self, files: list[FileInput]) -> BufferOutput: async def flatten( self, - pdf: FileInput, + pdf: FileInputWithUrl, annotation_ids: list[str | int] | None = None, ) -> BufferOutput: """Flatten annotations in a PDF document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + PDF validation is performed server-side. + Args: pdf: The PDF file to flatten annotation_ids: Optional list of specific annotation IDs to flatten. @@ -996,15 +1046,7 @@ async def flatten( result = await client.flatten('annotated-document.pdf', ['note1', 2, 'highlight3']) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - + # No client-side PDF validation - let server handle it flatten_action = BuildActions.flatten(annotation_ids) result = ( @@ -1018,7 +1060,7 @@ async def flatten( async def create_redactions_ai( self, - pdf: FileInput, + pdf: LocalFileInput, criteria: str, redaction_state: Literal["stage", "apply"] = "stage", pages: PageRange | None = None, @@ -1026,8 +1068,11 @@ async def create_redactions_ai( ) -> BufferOutput: """Use AI to redact sensitive information in a document. + **Security Note**: This method only accepts local files (direct API call). 
+ For remote files, fetch them first with proper validation. + Args: - pdf: The PDF file to redact + pdf: The PDF file to redact (local files only, no URLs) criteria: AI redaction criteria redaction_state: Whether to stage or apply redactions (default: 'stage') pages: Optional pages to redact @@ -1052,17 +1097,11 @@ async def create_redactions_ai( ) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) + # Process local files only + normalized_file = await process_file_input(pdf) - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - page_count = get_pdf_page_count(normalized_file[0]) - normalized_pages = normalize_page_params(pages, page_count) if pages else None + # Use pages directly - no page count computation needed + normalized_pages = normalize_page_params(pages) if pages else None document_data: dict[str, Any] = { "file": "file", @@ -1108,7 +1147,7 @@ async def create_redactions_ai( async def create_redactions_preset( self, - pdf: FileInput, + pdf: FileInputWithUrl, preset: SearchPreset, redaction_state: Literal["stage", "apply"] = "stage", pages: PageRange | None = None, @@ -1118,11 +1157,14 @@ async def create_redactions_preset( """Create redaction annotations based on a preset pattern. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Supports negative page indices (-1 = last page). 
+ Args: - pdf: The PDF file to create redactions in + pdf: The PDF file to create redactions in (URLs supported) preset: The preset pattern to search for (e.g., 'email-address', 'social-security-number') redaction_state: Whether to stage or apply redactions (default: 'stage') - pages: Optional page range to create redactions in + pages: Optional page range to create redactions in (supports negative indices) preset_options: Optional settings for the preset strategy options: Optional settings for creating redactions @@ -1132,29 +1174,25 @@ async def create_redactions_preset( Example: ```python result = await client.create_redactions_preset('document.pdf', 'email-address') + + # Works with URLs + result = await client.create_redactions_preset( + 'https://example.com/doc.pdf', + 'social-security-number' + ) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - # Get page count for handling negative indices - page_count = get_pdf_page_count(normalized_file[0]) - normalized_pages = normalize_page_params(pages, page_count) if pages else None + # No client-side PDF validation - let server handle it + # Use negative indices: -1 means "to end", calculate limit accordingly + start = pages.get("start", 0) if pages else 0 + end = pages.get("end", -1) if pages else -1 # Prepare strategy options with pages strategy_options = preset_options.copy() if preset_options else {} - if normalized_pages: - strategy_options["start"] = normalized_pages["start"] - if normalized_pages["end"] >= 0: - strategy_options["limit"] = ( - normalized_pages["end"] - normalized_pages["start"] + 1 - ) + strategy_options["start"] = start + # If end is -1, omit limit (search to end); otherwise calculate count + if end != -1: + strategy_options["limit"] = end - start + 1 
create_redactions_action = BuildActions.create_redactions_preset( preset, options, strategy_options @@ -1175,7 +1213,7 @@ async def create_redactions_preset( async def create_redactions_regex( self, - pdf: FileInput, + pdf: FileInputWithUrl, regex: str, redaction_state: Literal["stage", "apply"] = "stage", pages: PageRange | None = None, @@ -1185,11 +1223,14 @@ async def create_redactions_regex( r"""Create redaction annotations based on a regular expression. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Supports negative page indices (-1 = last page). + Args: - pdf: The PDF file to create redactions in + pdf: The PDF file to create redactions in (URLs supported) regex: The regular expression to search for redaction_state: Whether to stage or apply redactions (default: 'stage') - pages: Optional page range to create redactions in + pages: Optional page range to create redactions in (supports negative indices) regex_options: Optional settings for the regex strategy options: Optional settings for creating redactions @@ -1199,29 +1240,25 @@ async def create_redactions_regex( Example: ```python result = await client.create_redactions_regex('document.pdf', r'Account:\s*\d{8,12}') + + # Works with URLs + result = await client.create_redactions_regex( + 'https://example.com/doc.pdf', + r'\b\d{3}-\d{2}-\d{4}\b' + ) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - # Get page count for handling negative indices - page_count = get_pdf_page_count(normalized_file[0]) - normalized_pages = normalize_page_params(pages, page_count) if pages else None + # No client-side PDF validation - let server handle it + # Use negative indices: -1 means "to end", calculate 
limit accordingly + start = pages.get("start", 0) if pages else 0 + end = pages.get("end", -1) if pages else -1 # Prepare strategy options with pages strategy_options = regex_options.copy() if regex_options else {} - if normalized_pages: - strategy_options["start"] = normalized_pages["start"] - if normalized_pages["end"] >= 0: - strategy_options["limit"] = ( - normalized_pages["end"] - normalized_pages["start"] + 1 - ) + strategy_options["start"] = start + # If end is -1, omit limit (search to end); otherwise calculate count + if end != -1: + strategy_options["limit"] = end - start + 1 create_redactions_action = BuildActions.create_redactions_regex( regex, options, strategy_options @@ -1242,7 +1279,7 @@ async def create_redactions_regex( async def create_redactions_text( self, - pdf: FileInput, + pdf: FileInputWithUrl, text: str, redaction_state: Literal["stage", "apply"] = "stage", pages: PageRange | None = None, @@ -1252,11 +1289,14 @@ async def create_redactions_text( """Create redaction annotations based on text. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Supports negative page indices (-1 = last page). 
+ Args: - pdf: The PDF file to create redactions in + pdf: The PDF file to create redactions in (URLs supported) text: The text to search for redaction_state: Whether to stage or apply redactions (default: 'stage') - pages: Optional page range to create redactions in + pages: Optional page range to create redactions in (supports negative indices) text_options: Optional settings for the text strategy options: Optional settings for creating redactions @@ -1266,29 +1306,25 @@ async def create_redactions_text( Example: ```python result = await client.create_redactions_text('document.pdf', 'email@example.com') + + # Works with URLs + result = await client.create_redactions_text( + 'https://example.com/doc.pdf', + 'CONFIDENTIAL' + ) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - # Get page count for handling negative indices - page_count = get_pdf_page_count(normalized_file[0]) - normalized_pages = normalize_page_params(pages, page_count) if pages else None + # No client-side PDF validation - let server handle it + # Use negative indices: -1 means "to end", calculate limit accordingly + start = pages.get("start", 0) if pages else 0 + end = pages.get("end", -1) if pages else -1 # Prepare strategy options with pages strategy_options = text_options.copy() if text_options else {} - if normalized_pages: - strategy_options["start"] = normalized_pages["start"] - if normalized_pages["end"] >= 0: - strategy_options["limit"] = ( - normalized_pages["end"] - normalized_pages["start"] + 1 - ) + strategy_options["start"] = start + # If end is -1, omit limit (search to end); otherwise calculate count + if end != -1: + strategy_options["limit"] = end - start + 1 create_redactions_action = BuildActions.create_redactions_text( text, options, 
strategy_options @@ -1307,11 +1343,13 @@ async def create_redactions_text( return cast("BufferOutput", self._process_typed_workflow_result(result)) - async def apply_redactions(self, pdf: FileInput) -> BufferOutput: + async def apply_redactions(self, pdf: FileInputWithUrl) -> BufferOutput: """Apply staged redaction into the PDF. + **Note**: URLs are passed to the server for secure server-side fetching. + Args: - pdf: The PDF file with redaction annotations to apply + pdf: The PDF file with redaction annotations to apply (URLs supported) Returns: The document with applied redactions @@ -1330,15 +1368,7 @@ async def apply_redactions(self, pdf: FileInput) -> BufferOutput: """ apply_redactions_action = BuildActions.apply_redactions() - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - + # No client-side PDF validation - let server handle it result = ( await self.workflow() .add_file_part(pdf, None, [apply_redactions_action]) @@ -1350,68 +1380,67 @@ async def apply_redactions(self, pdf: FileInput) -> BufferOutput: async def rotate( self, - pdf: FileInput, + pdf: FileInputWithUrl, angle: Literal[90, 180, 270], pages: PageRange | None = None, ) -> BufferOutput: """Rotate pages in a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Supports negative page indices (-1 = last page, -2 = second-to-last, etc.). 
+ Args: - pdf: The PDF file to rotate + pdf: The PDF file to rotate (URLs supported) angle: Rotation angle (90, 180, or 270 degrees) - pages: Optional page range to rotate + pages: Optional page range to rotate (supports negative indices) Returns: The entire document with specified pages rotated Example: ```python + # Rotate entire document result = await client.rotate('document.pdf', 90) - # Rotate specific pages: + # Rotate specific pages result = await client.rotate('document.pdf', 90, {'start': 1, 'end': 3}) + + # Rotate with URL + result = await client.rotate('https://example.com/doc.pdf', 90) + + # Rotate last 3 pages using negative indices + result = await client.rotate('document.pdf', 90, {'start': -3, 'end': -1}) ``` """ rotate_action = BuildActions.rotate(angle) - - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - workflow = self.workflow() if pages: - page_count = get_pdf_page_count(normalized_file[0]) - normalized_pages = normalize_page_params(pages, page_count) + # Use negative index support (-1 = last page) + # No need for client-side PDF parsing + start = pages.get("start", 0) + end = pages.get("end", -1) - # Add pages before the range to rotate - if normalized_pages["start"] > 0: + # Add pages before the rotation range + if start > 0: part_options = cast( "FilePartOptions", - {"pages": {"start": 0, "end": normalized_pages["start"] - 1}}, + {"pages": {"start": 0, "end": start - 1}}, ) workflow = workflow.add_file_part(pdf, part_options) - # Add the specific pages with rotation action - part_options = cast("FilePartOptions", {"pages": normalized_pages}) + # Add the rotation range + part_options = cast( + "FilePartOptions", {"pages": {"start": start, "end": end}} + ) workflow = workflow.add_file_part(pdf, part_options, 
[rotate_action]) - # Add pages after the range to rotate - if normalized_pages["end"] < page_count - 1: + # Add pages after the rotation range (unless end is -1) + if end != -1: part_options = cast( "FilePartOptions", - { - "pages": { - "start": normalized_pages["end"] + 1, - "end": page_count - 1, - } - }, + {"pages": {"start": end + 1, "end": -1}}, ) workflow = workflow.add_file_part(pdf, part_options) else: @@ -1422,15 +1451,20 @@ async def rotate( return cast("BufferOutput", self._process_typed_workflow_result(result)) async def add_page( - self, pdf: FileInput, count: int = 1, index: int | None = None + self, pdf: FileInputWithUrl, count: int = 1, index: int | None = None ) -> BufferOutput: """Add blank pages to a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Index must be non-negative. If the index exceeds the document's page count, + the server will return an error. + Args: - pdf: The PDF file to add pages to + pdf: The PDF file to add pages to (URLs supported) count: The number of blank pages to add index: Optional index where to add the blank pages (0-based). If not provided, pages are added at the end. + Must be non-negative. 
Returns: The document with added pages @@ -1442,71 +1476,55 @@ async def add_page( # Add 1 blank page after the first page (at index 1) result = await client.add_page('document.pdf', 1, 1) + + # Works with URLs + result = await client.add_page('https://example.com/doc.pdf', 3) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - # If no index is provided or it's the end of the document, simply add pages at the end + # No client-side PDF validation - let server handle it + # If no index is provided, simply add pages at the end if index is None: builder = self.workflow() - builder.add_file_part(pdf) - - # Add the specified number of blank pages builder = builder.add_new_page({"pageCount": count}) - result = await builder.output_pdf().execute() else: - # Get the actual page count of the PDF - page_count = get_pdf_page_count(normalized_file[0]) - - # Validate that the index is within range - if index < 0 or index > page_count: - raise ValidationError( - f"Index {index} is out of range (document has {page_count} pages)" - ) + # Validate index is non-negative + if index < 0: + raise ValidationError(f"Index must be non-negative, got: {index}") builder = self.workflow() # Add pages before the specified index if index > 0: - before_pages = normalize_page_params( - {"start": 0, "end": index - 1}, page_count - ) + before_pages = normalize_page_params({"start": 0, "end": index - 1}) part_options = cast("FilePartOptions", {"pages": before_pages}) builder = builder.add_file_part(pdf, part_options) # Add the blank pages builder = builder.add_new_page({"pageCount": count}) - # Add pages after the specified index - if index < page_count: - after_pages = normalize_page_params( - {"start": index, "end": page_count - 1}, page_count - ) - part_options = 
cast("FilePartOptions", {"pages": after_pages}) - builder = builder.add_file_part(pdf, part_options) + # Add pages after the specified index (use -1 for "to end") + after_pages = normalize_page_params({"start": index, "end": -1}) + part_options = cast("FilePartOptions", {"pages": after_pages}) + builder = builder.add_file_part(pdf, part_options) result = await builder.output_pdf().execute() return cast("BufferOutput", self._process_typed_workflow_result(result)) async def split( - self, pdf: FileInput, page_ranges: list[PageRange] + self, pdf: FileInputWithUrl, page_ranges: list[PageRange] ) -> list[BufferOutput]: """Split a PDF document into multiple parts based on page ranges. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Supports negative page indices (-1 = last page). + Args: - pdf: The PDF file to split - page_ranges: Array of page ranges to extract + pdf: The PDF file to split (URLs supported) + page_ranges: Array of page ranges to extract (supports negative indices) Returns: An array of PDF documents, one for each page range @@ -1517,42 +1535,27 @@ async def split( {'start': 0, 'end': 2}, # Pages 0, 1, 2 {'start': 3, 'end': 5} # Pages 3, 4, 5 ]) + + # Works with URLs and negative indices + results = await client.split('https://example.com/doc.pdf', [ + {'start': 0, 'end': 4}, # First 5 pages + {'start': 5, 'end': -1} # Remaining pages to end + ]) ``` """ if not page_ranges or len(page_ranges) == 0: raise ValidationError("At least one page range is required for splitting") - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - # Get the actual page count of the PDF - page_count = get_pdf_page_count(normalized_file[0]) - - # Normalize and validate all 
page ranges - normalized_ranges = [ - normalize_page_params(page_range, page_count) for page_range in page_ranges - ] - - # Validate that all page ranges are within bounds - for page_range in normalized_ranges: - if page_range["start"] > page_range["end"]: - raise ValidationError( - f"Page range {page_range} is invalid (start > end)" - ) - - # Create a separate workflow for each page range + # No client-side PDF validation - server handles it + # Use negative indices directly - no page count needed import asyncio from typing import cast as typing_cast - async def create_split_pdf(page_range: Pages) -> BufferOutput: + async def create_split_pdf(page_range: PageRange) -> BufferOutput: builder = self.workflow() - part_options = cast("FilePartOptions", {"pages": page_range}) + # Normalize pages to ensure we have start/end + normalized = normalize_page_params(page_range) + part_options = cast("FilePartOptions", {"pages": normalized}) builder = builder.add_file_part(pdf, part_options) result = await builder.output_pdf().execute() return typing_cast( @@ -1560,19 +1563,22 @@ async def create_split_pdf(page_range: Pages) -> BufferOutput: ) # Execute all workflows in parallel and process the results - tasks = [create_split_pdf(page_range) for page_range in normalized_ranges] + tasks = [create_split_pdf(page_range) for page_range in page_ranges] results = await asyncio.gather(*tasks) return results async def duplicate_pages( - self, pdf: FileInput, page_indices: list[int] + self, pdf: FileInputWithUrl, page_indices: list[int] ) -> BufferOutput: """Create a new PDF containing only the specified pages in the order provided. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Supports negative page indices (-1 = last page, -2 = second-to-last, etc.). 
+ Args: - pdf: The PDF file to extract pages from + pdf: The PDF file to extract pages from (URLs supported) page_indices: Array of page indices to include in the new PDF (0-based) Negative indices count from the end of the document (e.g., -1 is the last page) @@ -1592,45 +1598,20 @@ async def duplicate_pages( # Create a new PDF with the first and last pages result = await client.duplicate_pages('document.pdf', [0, -1]) + + # Works with URLs + result = await client.duplicate_pages('https://example.com/doc.pdf', [0, -1]) ``` """ if not page_indices or len(page_indices) == 0: raise ValidationError("At least one page index is required for duplication") - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - # Get the actual page count of the PDF - page_count = get_pdf_page_count(normalized_file[0]) - - # Normalize negative indices - normalized_indices = [] - for index in page_indices: - if index < 0: - # Handle negative indices (e.g., -1 is the last page) - normalized_indices.append(page_count + index) - else: - normalized_indices.append(index) - - # Validate that all page indices are within range - for i, original_index in enumerate(page_indices): - normalized_index = normalized_indices[i] - if normalized_index < 0 or normalized_index >= page_count: - raise ValidationError( - f"Page index {original_index} is out of range (document has {page_count} pages)" - ) - + # No client-side PDF validation - let server handle it + # Use negative indices directly - server interprets them builder = self.workflow() # Add each page in the order specified - for page_index in normalized_indices: - # Use normalize_page_params to ensure consistent handling + for page_index in page_indices: page_range = normalize_page_params({"start": page_index, "end": page_index}) 
part_options = cast("FilePartOptions", {"pages": page_range}) builder = builder.add_file_part(pdf, part_options) @@ -1639,13 +1620,16 @@ async def duplicate_pages( return cast("BufferOutput", self._process_typed_workflow_result(result)) async def delete_pages( - self, pdf: FileInput, page_indices: list[int] + self, pdf: FileInputWithUrl, page_indices: list[int] ) -> BufferOutput: """Delete pages from a PDF document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Supports negative page indices (-1 = last page, -2 = second-to-last, etc.). + Args: - pdf: The PDF file to modify + pdf: The PDF file to modify (URLs supported) page_indices: Array of page indices to delete (0-based) Negative indices count from the end of the document (e.g., -1 is the last page) @@ -1662,67 +1646,45 @@ async def delete_pages( # Delete the first and last two pages result = await client.delete_pages('document.pdf', [0, -1, -2]) + + # Works with URLs + result = await client.delete_pages('https://example.com/doc.pdf', [0, -1]) ``` """ if not page_indices or len(page_indices) == 0: raise ValidationError("At least one page index is required for deletion") - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - # Get the actual page count of the PDF - page_count = get_pdf_page_count(normalized_file[0]) - - # Normalize negative indices - normalized_indices = [] - for index in page_indices: - if index < 0: - # Handle negative indices (e.g., -1 is the last page) - normalized_indices.append(page_count + index) - else: - normalized_indices.append(index) - - # Remove duplicates and sort the deleteIndices - delete_indices = sorted(set(normalized_indices)) - - # Validate that all page indices are 
within range - for original_index in page_indices: - if original_index >= 0: - normalized_index = original_index - else: - normalized_index = page_count + original_index - - if normalized_index < 0 or normalized_index >= page_count: - raise ValidationError( - f"Page index {original_index} is out of range (document has {page_count} pages)" - ) + # No client-side PDF validation or page count computation + # Use negative indices directly - server interprets them + # Remove duplicates and sort the delete indices + delete_indices = sorted(set(page_indices)) builder = self.workflow() - # Group consecutive pages that should be kept into ranges + # Build "keep ranges" by finding gaps between deleted indices + # NOTE(review): only non-negative indices are turned into gaps here - negative indices are skipped below, so confirm they are actually deleted current_page = 0 page_ranges = [] for delete_index in delete_indices: - if current_page < delete_index: - page_ranges.append( - normalize_page_params( - {"start": current_page, "end": delete_index - 1} + # Skip negative indices in this calculation - they represent "from end" + if delete_index >= 0: + if current_page < delete_index: + page_ranges.append( + normalize_page_params( + {"start": current_page, "end": delete_index - 1} + ) ) - ) - current_page = delete_index + 1 + current_page = delete_index + 1 + # Add the remaining pages up to the end (-1 = last page), but only if at least one non-negative index was processed if ( - current_page > 0 or (current_page == 0 and len(delete_indices) == 0) - ) and current_page < page_count: + current_page >= 0 + and len(delete_indices) > 0 + and any(i >= 0 for i in delete_indices) + ): page_ranges.append( - normalize_page_params({"start": current_page, "end": page_count - 1}) + normalize_page_params({"start": current_page, "end": -1}) ) if len(page_ranges) == 0: @@ -1737,14 +1699,17 @@ async def delete_pages( async def optimize( self, - pdf: FileInput, + pdf: FileInputWithUrl, options: OptimizePdf | None = None, ) -> BufferOutput: """Optimize a PDF document for size reduction. 
This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + PDF validation is performed server-side. + Args: - pdf: The PDF file to optimize + pdf: The PDF file to optimize (URLs supported) options: Optimization options Returns: @@ -1757,17 +1722,12 @@ async def optimize( 'mrcCompression': True, 'imageOptimizationQuality': 2 }) + + # Works with URLs + result = await client.optimize('https://example.com/large.pdf') ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - + # No client-side PDF validation - let server handle it if options is None: options = {"imageOptimizationQuality": 2} diff --git a/src/nutrient_dws/inputs.py b/src/nutrient_dws/inputs.py index 5acb5cf..125492c 100644 --- a/src/nutrient_dws/inputs.py +++ b/src/nutrient_dws/inputs.py @@ -1,15 +1,17 @@ import contextlib import io import os -import re from pathlib import Path from typing import BinaryIO, TypeGuard from urllib.parse import urlparse import aiofiles -import httpx -FileInput = str | Path | bytes | BinaryIO +# Type definitions for file inputs +# Breaking change in v3.0.0: the FileInput type no longer includes str at all (URL strings and plain string paths alike) +LocalFileInput = Path | bytes | BinaryIO +FileInputWithUrl = str | Path | bytes | BinaryIO +FileInput = LocalFileInput # Breaking change: no longer includes str (URLs or string paths) NormalizedFileData = tuple[bytes, str] @@ -36,7 +38,7 @@ def is_valid_pdf(file_bytes: bytes) -> bool: return file_bytes.startswith(b"%PDF-") -def is_remote_file_input(file_input: FileInput) -> TypeGuard[str]: +def is_remote_file_input(file_input: FileInputWithUrl) -> TypeGuard[str]: """Check if the file input is a remote URL. 
Args: @@ -48,7 +50,9 @@ def is_remote_file_input(file_input: FileInput) -> TypeGuard[str]: return isinstance(file_input, str) and is_url(file_input) -async def process_file_input(file_input: FileInput) -> NormalizedFileData: +async def process_file_input( + file_input: LocalFileInput | FileInputWithUrl, +) -> NormalizedFileData: """Convert various file input types to bytes. Args: @@ -140,28 +144,12 @@ async def process_file_input(file_input: FileInput) -> NormalizedFileData: raise ValueError(f"Unsupported file input type: {type(file_input)}") -async def process_remote_file_input(url: str) -> NormalizedFileData: - """Convert various file input types to bytes.""" - async with httpx.AsyncClient() as client: - response = await client.get(url) - # This will raise an exception for bad responses (4xx or 5xx status codes) - response.raise_for_status() - # The .content attribute holds the raw bytes of the response - file_bytes = response.content +# process_remote_file_input() has been removed in v3.0.0 +# URLs are now passed to the server for secure server-side fetching +# This function was removed to prevent SSRF vulnerabilities - filename = "downloaded_file" - # Try to get filename from 'Content-Disposition' header first - header = response.headers.get("content-disposition") - if header: - # Use regex to find a filename in the header - match = re.search(r'filename="?([^"]+)"?', header) - if match: - filename = match.group(1) - return file_bytes, filename - - -def validate_file_input(file_input: FileInput) -> bool: +def validate_file_input(file_input: LocalFileInput | FileInputWithUrl) -> bool: """Validate that the file input is in a supported format. Args: @@ -179,45 +167,7 @@ def validate_file_input(file_input: FileInput) -> bool: return False -def get_pdf_page_count(pdf_bytes: bytes) -> int: - """Zero dependency way to get the number of pages in a PDF. - - Args: - pdf_bytes: PDF file bytes - - Returns: - Number of pages in a PDF. 
- """ - # Find all PDF objects - objects = re.findall(rb"(\d+)\s+(\d+)\s+obj(.*?)endobj", pdf_bytes, re.DOTALL) - - # Get the Catalog Object - catalog_obj = None - for _obj_num, _gen_num, obj_data in objects: - if b"/Type" in obj_data and b"/Catalog" in obj_data: - catalog_obj = obj_data - break - - if not catalog_obj: - raise ValueError("Could not find /Catalog object in PDF.") - - # Extract /Pages reference (e.g. 3 0 R) - pages_ref_match = re.search(rb"/Pages\s+(\d+)\s+(\d+)\s+R", catalog_obj) - if not pages_ref_match: - raise ValueError("Could not find /Pages reference in /Catalog.") - pages_obj_num = pages_ref_match.group(1).decode() - pages_obj_gen = pages_ref_match.group(2).decode() - - # Step 3: Find the referenced /Pages object - pages_obj_pattern = rf"{pages_obj_num}\s+{pages_obj_gen}\s+obj(.*?)endobj".encode() - pages_obj_match = re.search(pages_obj_pattern, pdf_bytes, re.DOTALL) - if not pages_obj_match: - raise ValueError("Could not find root /Pages object.") - pages_obj_data = pages_obj_match.group(1) - - # Step 4: Extract /Count - count_match = re.search(rb"/Count\s+(\d+)", pages_obj_data) - if not count_match: - raise ValueError("Could not find /Count in root /Pages object.") - - return int(count_match.group(1)) +# get_pdf_page_count() has been removed in v3.0.0 +# The API natively supports negative indices (-1 = last page) +# Client-side PDF parsing is no longer needed +# This removes ~40 lines of code and improves security diff --git a/tests/test_integration.py b/tests/test_integration.py index 3e57352..922ebd4 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -306,7 +306,8 @@ async def test_invalid_api_key(self): @pytest.mark.asyncio async def test_network_timeout(self): """Test handling of network timeouts.""" - timeout_client = NutrientClient(api_key=os.getenv("NUTRIENT_API_KEY", ""), timeout=1) + # Use an extremely short timeout (1ms) to guarantee a timeout error + timeout_client = 
NutrientClient(api_key=os.getenv("NUTRIENT_API_KEY", ""), timeout=0.001) with pytest.raises(NutrientError): await timeout_client.convert(sample_docx, "pdf") diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index c3873b7..ba5fb27 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -871,22 +871,14 @@ async def test_delete_token(self, mock_send_request, valid_client_options, unit_ class TestNutrientClientFlatten: """Tests for NutrientClient flatten functionality.""" - @patch("nutrient_dws.client.is_remote_file_input", return_value=False) - @patch("nutrient_dws.client.process_file_input") - @patch("nutrient_dws.client.is_valid_pdf", return_value=True) @patch("nutrient_dws.client.StagedWorkflowBuilder") @pytest.mark.asyncio async def test_flatten_all_annotations( self, mock_staged_workflow_builder, - mock_is_valid_pdf, - mock_process_file_input, - mock_is_remote, unit_client, ): """Test flattening all annotations (no annotation_ids specified).""" - mock_process_file_input.return_value = (b"%PDF-test", "test.pdf") - # Setup mock workflow mock_workflow_instance = MagicMock() mock_output_stage = MagicMock() @@ -927,22 +919,14 @@ async def test_flatten_all_annotations( assert result["buffer"] == b"test-buffer" assert result["mimeType"] == "application/pdf" - @patch("nutrient_dws.client.is_remote_file_input", return_value=False) - @patch("nutrient_dws.client.process_file_input") - @patch("nutrient_dws.client.is_valid_pdf", return_value=True) @patch("nutrient_dws.client.StagedWorkflowBuilder") @pytest.mark.asyncio async def test_flatten_specific_annotations_by_string_ids( self, mock_staged_workflow_builder, - mock_is_valid_pdf, - mock_process_file_input, - mock_is_remote, unit_client, ): """Test flattening specific annotations by string IDs.""" - mock_process_file_input.return_value = (b"%PDF-test", "test.pdf") - # Setup mock workflow mock_workflow_instance = MagicMock() mock_output_stage = MagicMock() @@ -977,22 +961,14 @@ async def 
test_flatten_specific_annotations_by_string_ids( # Verify the result assert result["buffer"] == b"test-buffer" - @patch("nutrient_dws.client.is_remote_file_input", return_value=False) - @patch("nutrient_dws.client.process_file_input") - @patch("nutrient_dws.client.is_valid_pdf", return_value=True) @patch("nutrient_dws.client.StagedWorkflowBuilder") @pytest.mark.asyncio async def test_flatten_specific_annotations_by_integer_ids( self, mock_staged_workflow_builder, - mock_is_valid_pdf, - mock_process_file_input, - mock_is_remote, unit_client, ): """Test flattening specific annotations by integer IDs.""" - mock_process_file_input.return_value = (b"%PDF-test", "test.pdf") - # Setup mock workflow mock_workflow_instance = MagicMock() mock_output_stage = MagicMock() @@ -1027,22 +1003,14 @@ async def test_flatten_specific_annotations_by_integer_ids( # Verify the result assert result["buffer"] == b"test-buffer" - @patch("nutrient_dws.client.is_remote_file_input", return_value=False) - @patch("nutrient_dws.client.process_file_input") - @patch("nutrient_dws.client.is_valid_pdf", return_value=True) @patch("nutrient_dws.client.StagedWorkflowBuilder") @pytest.mark.asyncio async def test_flatten_specific_annotations_by_mixed_ids( self, mock_staged_workflow_builder, - mock_is_valid_pdf, - mock_process_file_input, - mock_is_remote, unit_client, ): """Test flattening specific annotations with mixed string and integer IDs.""" - mock_process_file_input.return_value = (b"%PDF-test", "test.pdf") - # Setup mock workflow mock_workflow_instance = MagicMock() mock_output_stage = MagicMock() diff --git a/tests/unit/test_inputs.py b/tests/unit/test_inputs.py index e8f126f..bb9714a 100644 --- a/tests/unit/test_inputs.py +++ b/tests/unit/test_inputs.py @@ -5,13 +5,11 @@ import pytest from nutrient_dws.inputs import ( - get_pdf_page_count, - is_remote_file_input, - is_valid_pdf, + is_remote_file_input, # Still used internally process_file_input, - process_remote_file_input, validate_file_input, 
- FileInput, + LocalFileInput, + FileInputWithUrl, ) from tests.helpers import sample_pdf, TestDocumentGenerator @@ -170,126 +168,6 @@ async def test_throw_for_none(self): await process_file_input(None) -class TestProcessRemoteFileInput: - @pytest.mark.asyncio - async def test_process_url_string_input(self): - mock_response_data = b"test pdf content" - - with patch("httpx.AsyncClient") as mock_client: - mock_response = AsyncMock() - mock_response.content = mock_response_data - mock_response.headers = {} - mock_response.raise_for_status = Mock(return_value=None) - mock_client.return_value.__aenter__.return_value.get.return_value = ( - mock_response - ) - - result = await process_remote_file_input("https://example.com/test.pdf") - - assert result[0] == mock_response_data - assert result[1] == "downloaded_file" - - @pytest.mark.asyncio - async def test_process_url_with_content_disposition_header(self): - mock_response_data = b"test pdf content" - - with patch("httpx.AsyncClient") as mock_client: - mock_response = AsyncMock() - mock_response.content = mock_response_data - mock_response.headers = { - "content-disposition": 'attachment; filename="document.pdf"' - } - mock_response.raise_for_status = Mock(return_value=None) - mock_client.return_value.__aenter__.return_value.get.return_value = ( - mock_response - ) - - result = await process_remote_file_input("https://example.com/test.pdf") - - assert result[0] == mock_response_data - assert result[1] == "document.pdf" - - @pytest.mark.asyncio - async def test_throw_error_for_http_error(self): - with patch("httpx.AsyncClient") as mock_client: - mock_response = AsyncMock() - mock_response.raise_for_status = Mock(side_effect=Exception("HTTP 404")) - mock_client.return_value.__aenter__.return_value.get.return_value = ( - mock_response - ) - - with pytest.raises(Exception): - await process_remote_file_input("https://example.com/test.pdf") - - -class TestGetPdfPageCount: - def test_pdf_with_1_page(self): - pdf_bytes = 
TestDocumentGenerator.generate_simple_pdf_content("Text") - result = get_pdf_page_count(pdf_bytes) - assert result == 1 - - def test_pdf_with_6_pages(self): - result = get_pdf_page_count(sample_pdf) - assert result == 6 - - def test_throw_for_invalid_pdf_no_objects(self): - invalid_pdf = b"%PDF-1.4\n%%EOF" - - with pytest.raises(ValueError, match="Could not find /Catalog object"): - get_pdf_page_count(invalid_pdf) - - def test_throw_for_invalid_pdf_no_catalog(self): - invalid_pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /NotCatalog >>\nendobj\n%%EOF" - - with pytest.raises(ValueError, match="Could not find /Catalog object"): - get_pdf_page_count(invalid_pdf) - - def test_throw_for_catalog_without_pages_reference(self): - invalid_pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\n%%EOF" - - with pytest.raises(ValueError, match="Could not find /Pages reference"): - get_pdf_page_count(invalid_pdf) - - def test_throw_for_missing_pages_object(self): - invalid_pdf = ( - b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n%%EOF" - ) - - with pytest.raises(ValueError, match="Could not find root /Pages object"): - get_pdf_page_count(invalid_pdf) - - def test_throw_for_pages_object_without_count(self): - invalid_pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n2 0 obj\n<< /Type /Pages >>\nendobj\n%%EOF" - - with pytest.raises(ValueError, match="Could not find /Count"): - get_pdf_page_count(invalid_pdf) - - -class TestIsValidPdf: - def test_return_true_for_valid_pdf_files(self): - # Test with generated PDF - valid_pdf_bytes = TestDocumentGenerator.generate_simple_pdf_content( - "Test content" - ) - result = is_valid_pdf(valid_pdf_bytes) - assert result is True - - # Test with sample PDF - result = is_valid_pdf(sample_pdf) - assert result is True - - def test_return_false_for_non_pdf_files(self): - # Test with non-PDF bytes - non_pdf_bytes = b"This is not a PDF file" - result = is_valid_pdf(non_pdf_bytes) - assert result is False - - def 
test_return_false_for_partial_pdf_header(self): - # Test with partial PDF header - partial_pdf = b"%PD" - result = is_valid_pdf(partial_pdf) - assert result is False - - def test_return_false_for_empty_bytes(self): - result = is_valid_pdf(b"") - assert result is False +# Tests for process_remote_file_input, get_pdf_page_count, and is_valid_pdf removed in v3.0.0 +# These functions were removed from the public API for security reasons (SSRF protection) +# and to eliminate client-side PDF parsing (leveraging server-side negative index support)