aws-solutions-library-samples
diff --git a/config_library/pattern-2/default/config.yaml b/config_library/pattern-2/default/config.yaml
Lines changed: 12 additions & 0 deletions
diff --git a/lib/idp_common_pkg/idp_common/assessment/service.py b/lib/idp_common_pkg/idp_common/assessment/service.py
Lines changed: 18 additions & 3 deletions
diff --git a/lib/idp_common_pkg/idp_common/classification/service.py b/lib/idp_common_pkg/idp_common/classification/service.py
Lines changed: 16 additions & 2 deletions
diff --git a/lib/idp_common_pkg/idp_common/extraction/service.py b/lib/idp_common_pkg/idp_common/extraction/service.py
Lines changed: 17 additions & 3 deletions
diff --git a/lib/idp_common_pkg/idp_common/image/__init__.py b/lib/idp_common_pkg/idp_common/image/__init__.py
Lines changed: 16 additions & 6 deletions
diff --git a/lib/idp_common_pkg/idp_common/ocr/service.py b/lib/idp_common_pkg/idp_common/ocr/service.py
Lines changed: 35 additions & 6 deletions
diff --git a/lib/idp_common_pkg/tests/unit/classification/test_classification_service.py b/lib/idp_common_pkg/tests/unit/classification/test_classification_service.py
Lines changed: 11 additions & 6 deletions
@@ -7,6 +7,9 @@ ocr:
     - name: LAYOUT
     - name: TABLES
     - name: SIGNATURES
+  image:
+    target_width: '951'
+    target_height: '1268'
 classes:
   - name: letter
     description: A formal written correspondence with sender/recipient addresses, date, salutation, body, and closing signature
@@ -300,6 +303,9 @@ classes:
       - name: comments
         description: Additional notes or remarks about the document. Look for sections labeled 'notes', 'remarks', or 'comments'.
 classification:
+  image:
+    target_width: '951'
+    target_height: '1268'
   top_p: '0.1'
   max_tokens: '4096'
   top_k: '5'
@@ -439,6 +445,9 @@ classification:
     You are a document classification expert who can analyze and classify multiple documents and their page boundaries within a document package from various domains. Your task is to determine the document type based on its content and structure, using the provided document type definitions. Your output must be valid JSON according to the requested format.
   classificationMethod: textbasedHolisticClassification
 extraction:
+  image:
+    target_width: '951'
+    target_height: '1268'
   top_p: '0.1'
   max_tokens: '10000'
   top_k: '5'
@@ -594,6 +603,9 @@ summarization:
   system_prompt: >-
     You are a document summarization expert who can analyze and summarize documents from various domains including medical, financial, legal, and general business documents. Your task is to create a summary that captures the key information, main points, and important details from the document. Your output must be in valid JSON format. \nSummarization Style: Balanced\\nCreate a balanced summary that provides a moderate level of detail. Include the main points and key supporting information, while maintaining the document's overall structure. Aim for a comprehensive yet concise summary.\n Your output MUST be in valid JSON format with markdown content. You MUST strictly adhere to the output format specified in the instructions.
 assessment:
+  image:
+    target_width: '951'
+    target_height: '1268'
   default_confidence_threshold: '0.9'
   top_p: '0.1'
   max_tokens: '10000'
 
@@ -677,15 +677,31 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
             t2 = time.time()
             logger.info(f"Time taken to read text content: {t2 - t1:.2f} seconds")
 
-            # Read page images
+            # Read page images with configurable dimensions
+            assessment_config = self.config.get("assessment", {})
+            image_config = assessment_config.get("image", {})
+            target_width = image_config.get("target_width")
+            target_height = image_config.get("target_height")
+
             page_images = []
             for page_id in sorted_page_ids:
                 if page_id not in document.pages:
                     continue
 
                 page = document.pages[page_id]
                 image_uri = page.image_uri
-                image_content = image.prepare_image(image_uri)
+
+                if target_width is not None and target_height is not None:
+                    # Cast to int in case config values are strings
+                    target_width = int(target_width)
+                    target_height = int(target_height)
+                    image_content = image.prepare_image(
+                        image_uri, target_width, target_height
+                    )
+                else:
+                    image_content = image.prepare_image(
+                        image_uri
+                    )  # Uses function defaults
                 page_images.append(image_content)
 
             t3 = time.time()
@@ -709,7 +725,6 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
             logger.info(f"Time taken to read raw OCR results: {t4 - t3:.2f} seconds")
 
             # Get assessment configuration
-            assessment_config = self.config.get("assessment", {})
             model_id = self.config.get("model_id") or assessment_config.get("model")
             temperature = _safe_float_conversion(
                 assessment_config.get("temperature", 0), 0.0
 
@@ -605,10 +605,24 @@ def classify_page_bedrock(
                 logger.warning(f"Failed to load text content from {text_uri}: {e}")
                 # Continue without text content
 
-        # Load image content from URI
+        # Load image content from URI with configurable dimensions
         if image_uri:
             try:
-                image_content = s3.get_binary_content(image_uri)
+                image_config = self.config.get("classification", {}).get("image", {})
+                target_width = image_config.get("target_width")
+                target_height = image_config.get("target_height")
+
+                if target_width is not None and target_height is not None:
+                    # Cast to int in case config values are strings
+                    target_width = int(target_width)
+                    target_height = int(target_height)
+                    image_content = image.prepare_image(
+                        image_uri, target_width, target_height
+                    )
+                else:
+                    image_content = image.prepare_image(
+                        image_uri
+                    )  # Uses function defaults
             except Exception as e:
                 logger.warning(f"Failed to load image content from {image_uri}: {e}")
                 # Continue without image content
 
@@ -617,22 +617,36 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
             t1 = time.time()
             logger.info(f"Time taken to read text content: {t1 - t0:.2f} seconds")
 
-            # Read page images
+            # Read page images with configurable dimensions
+            extraction_config = self.config.get("extraction", {})
+            image_config = extraction_config.get("image", {})
+            target_width = image_config.get("target_width")
+            target_height = image_config.get("target_height")
+
             page_images = []
             for page_id in sorted_page_ids:
                 if page_id not in document.pages:
                     continue
 
                 page = document.pages[page_id]
                 image_uri = page.image_uri
-                image_content = image.prepare_image(image_uri)
+                if target_width is not None and target_height is not None:
+                    # Cast to int in case config values are strings
+                    target_width = int(target_width)
+                    target_height = int(target_height)
+                    image_content = image.prepare_image(
+                        image_uri, target_width, target_height
+                    )
+                else:
+                    image_content = image.prepare_image(
+                        image_uri
+                    )  # Uses function defaults
                 page_images.append(image_content)
 
             t2 = time.time()
             logger.info(f"Time taken to read images: {t2 - t1:.2f} seconds")
 
             # Get extraction configuration
-            extraction_config = self.config.get("extraction", {})
             model_id = self.config.get("model_id") or extraction_config.get("model")
             temperature = float(extraction_config.get("temperature", 0))
             top_k = float(extraction_config.get("top_k", 5))
 
@@ -14,7 +14,8 @@ def resize_image(image_data: bytes,
                 target_width: int = 951, 
                 target_height: int = 1268) -> bytes:
     """
-    Resize an image to target dimensions if larger than target
+    Resize an image to fit within target dimensions while preserving aspect ratio.
+    No padding, no distortion - pure proportional scaling.
     
     Args:
         image_data: Raw image bytes
@@ -26,13 +27,22 @@ def resize_image(image_data: bytes,
     """
     image = Image.open(io.BytesIO(image_data))
     current_width, current_height = image.size
-    current_resolution = current_width * current_height
-    target_resolution = target_width * target_height
 
-    if current_resolution > target_resolution:
-        logger.info(f"Downsizing image from {current_width}x{current_height}")
-        image = image.resize((target_width, target_height))
+    # Calculate scaling factor to fit within bounds while preserving aspect ratio
+    width_ratio = target_width / current_width
+    height_ratio = target_height / current_height
+    scale_factor = min(width_ratio, height_ratio)  # Fit within bounds
 
+    # Only resize if we're making it smaller
+    if scale_factor < 1.0:
+        new_width = int(current_width * scale_factor)
+        new_height = int(current_height * scale_factor)
+        logger.info(f"Resizing image from {current_width}x{current_height} to {new_width}x{new_height} (scale: {scale_factor:.3f})")
+        image = image.resize((new_width, new_height), Image.LANCZOS)
+    else:
+        logger.debug(f"Image {current_width}x{current_height} already fits within {target_width}x{target_height}, no resizing needed")
+    
+    # Convert to JPEG bytes
     img_byte_array = io.BytesIO()
     image.save(img_byte_array, format="JPEG")
     return img_byte_array.getvalue()
 
@@ -32,6 +32,8 @@ def __init__(
         region: Optional[str] = None,
         max_workers: int = 20,
         enhanced_features: Union[bool, List[str]] = False,
+        dpi: int = 300,
+        resize_config: Optional[Dict[str, Any]] = None,
     ):
         """
         Initialize the OCR service.
@@ -43,12 +45,26 @@ def __init__(
                            - If False: Uses basic detect_document_text (faster, no features)
                            - If List[str]: Uses analyze_document with specified features
                               Valid features: TABLES, FORMS, SIGNATURES, LAYOUT
+            dpi: DPI (dots per inch) for image generation from PDF pages
+            resize_config: Optional dictionary containing image resizing configuration
+                          with 'target_width' and 'target_height' keys
 
         Raises:
             ValueError: If invalid features are specified in enhanced_features
         """
         self.region = region or os.environ.get("AWS_REGION", "us-east-1")
         self.max_workers = max_workers
+        self.dpi = dpi
+        self.resize_config = resize_config
+
+        # Log DPI setting for debugging
+        logger.info(f"OCR Service initialized with DPI: {self.dpi}")
+
+        # Log resize config if provided
+        if self.resize_config:
+            logger.info(
+                f"OCR Service initialized with resize config: {self.resize_config}"
+            )
 
         # Define valid Textract feature types
         VALID_FEATURES = ["TABLES", "FORMS", "SIGNATURES", "LAYOUT"]
@@ -260,12 +276,12 @@ def _process_single_page(
         t0 = time.time()
         page_id = page_index + 1
 
-        # Extract page image
+        # Extract page image at specified DPI
         page = pdf_document.load_page(page_index)
-        pix = page.get_pixmap()
+        pix = page.get_pixmap(dpi=self.dpi)
         img_bytes = pix.tobytes("jpeg")
 
-        # Upload image to S3
+        # Upload original image to S3
         image_key = f"{prefix}/pages/{page_id}/image.jpg"
         s3.write_content(img_bytes, output_bucket, image_key, content_type="image/jpeg")
 
@@ -274,12 +290,25 @@ def _process_single_page(
             f"Time for image conversion (page {page_id}): {t1 - t0:.6f} seconds"
         )
 
-        # Process with Textract
+        # Resize image for OCR processing if configured
+        ocr_img_bytes = img_bytes  # Default to original image
+        if self.resize_config:
+            from idp_common import image
+
+            target_width = self.resize_config.get("target_width")
+            target_height = self.resize_config.get("target_height")
+
+            ocr_img_bytes = image.resize_image(img_bytes, target_width, target_height)
+            logger.debug(
+                f"Resized image for OCR processing (page {page_id}) to {target_width}x{target_height}"
+            )
+
+        # Process with OCR using potentially resized image
         if isinstance(self.enhanced_features, list) and self.enhanced_features:
-            textract_result = self._analyze_document(img_bytes, page_id)
+            textract_result = self._analyze_document(ocr_img_bytes, page_id)
         else:
             textract_result = self.textract_client.detect_document_text(
-                Document={"Bytes": img_bytes}
+                Document={"Bytes": ocr_img_bytes}
             )
 
         # Extract metering data
 
@@ -190,19 +190,24 @@ def test_prepare_prompt_from_template(self, service):
             assert result == "Formatted prompt"
 
     @patch("idp_common.s3.get_text_content")
-    @patch("idp_common.s3.get_binary_content")
+    @patch("idp_common.image.prepare_image")
     @patch(
         "idp_common.classification.service.ClassificationService._invoke_bedrock_model"
     )
     @patch("idp_common.image.prepare_bedrock_image_attachment")
     def test_classify_page_bedrock_success(
-        self, mock_prepare_image, mock_invoke, mock_get_binary, mock_get_text, service
+        self,
+        mock_prepare_bedrock_image,
+        mock_invoke,
+        mock_prepare_image,
+        mock_get_text,
+        service,
     ):
         """Test successful page classification with Bedrock."""
         # Mock responses
         mock_get_text.return_value = "This is an invoice for $100"
-        mock_get_binary.return_value = b"image_data"
-        mock_prepare_image.return_value = {"image": "base64_encoded_image"}
+        mock_prepare_image.return_value = b"image_data"
+        mock_prepare_bedrock_image.return_value = {"image": "base64_encoded_image"}
         mock_invoke.return_value = {
             "response": {
                 "output": {"message": {"content": [{"text": '{"class": "invoice"}'}]}}
@@ -227,8 +232,8 @@ def test_classify_page_bedrock_success(
 
         # Verify calls
         mock_get_text.assert_called_once_with("s3://bucket/text.txt")
-        mock_get_binary.assert_called_once_with("s3://bucket/image.jpg")
-        mock_prepare_image.assert_called_once_with(b"image_data")
+        mock_prepare_image.assert_called_once_with("s3://bucket/image.jpg")
+        mock_prepare_bedrock_image.assert_called_once_with(b"image_data")
         mock_invoke.assert_called_once()
 
     @patch("idp_common.s3.get_text_content")