feature: add image resize config

aianch · aianch · commit f38f6aa0ba2a · 2025-06-20T03:27:06.000+02:00
diff --git a/lib/idp_common_pkg/idp_common/assessment/service.py b/lib/idp_common_pkg/idp_common/assessment/service.py
@@ -677,15 +677,19 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
             t2 = time.time()
             logger.info(f"Time taken to read text content: {t2 - t1:.2f} seconds")
 
-            # Read page images
+            # Read page images with configurable dimensions
+            image_config = self.config.get('image', {})
+            target_width = image_config.get('target_width', 951)   # Fallback used when the 'image' config omits target_width
+            target_height = image_config.get('target_height', 1268)
+            
             page_images = []
             for page_id in sorted_page_ids:
                 if page_id not in document.pages:
                     continue
 
                 page = document.pages[page_id]
                 image_uri = page.image_uri
-                image_content = image.prepare_image(image_uri)
+                image_content = image.prepare_image(image_uri, target_width, target_height)
                 page_images.append(image_content)
 
             t3 = time.time()
diff --git a/lib/idp_common_pkg/idp_common/classification/service.py b/lib/idp_common_pkg/idp_common/classification/service.py
@@ -605,10 +605,14 @@ def classify_page_bedrock(
                 logger.warning(f"Failed to load text content from {text_uri}: {e}")
                 # Continue without text content
 
-        # Load image content from URI
+        # Load image content from URI with configurable dimensions
         if image_uri:
             try:
-                image_content = s3.get_binary_content(image_uri)
+                image_config = self.config.get('image', {})
+                target_width = image_config.get('target_width', 951)   # Fallback used when the 'image' config omits target_width
+                target_height = image_config.get('target_height', 1268)
+                
+                image_content = image.prepare_image(image_uri, target_width, target_height)
             except Exception as e:
                 logger.warning(f"Failed to load image content from {image_uri}: {e}")
                 # Continue without image content
diff --git a/lib/idp_common_pkg/idp_common/extraction/service.py b/lib/idp_common_pkg/idp_common/extraction/service.py
@@ -617,15 +617,19 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
             t1 = time.time()
             logger.info(f"Time taken to read text content: {t1 - t0:.2f} seconds")
 
-            # Read page images
+            # Read page images with configurable dimensions
+            image_config = self.config.get('image', {})
+            target_width = image_config.get('target_width', 951)   # Fallback used when the 'image' config omits target_width
+            target_height = image_config.get('target_height', 1268)
+            
             page_images = []
             for page_id in sorted_page_ids:
                 if page_id not in document.pages:
                     continue
 
                 page = document.pages[page_id]
                 image_uri = page.image_uri
-                image_content = image.prepare_image(image_uri)
+                image_content = image.prepare_image(image_uri, target_width, target_height)
                 page_images.append(image_content)
 
             t2 = time.time()
diff --git a/lib/idp_common_pkg/idp_common/image/__init__.py b/lib/idp_common_pkg/idp_common/image/__init__.py
@@ -14,7 +14,8 @@ def resize_image(image_data: bytes,
                 target_width: int = 951, 
                 target_height: int = 1268) -> bytes:
     """
-    Resize an image to target dimensions if larger than target
+    Downscale an image to fit within target dimensions while preserving aspect ratio.
+    No padding, no distortion, no upscaling; the result is always re-encoded as JPEG.
     
     Args:
         image_data: Raw image bytes
@@ -26,13 +27,22 @@ def resize_image(image_data: bytes,
     """
     image = Image.open(io.BytesIO(image_data))
     current_width, current_height = image.size
-    current_resolution = current_width * current_height
-    target_resolution = target_width * target_height
     
-    if current_resolution > target_resolution:
-        logger.info(f"Downsizing image from {current_width}x{current_height}")
-        image = image.resize((target_width, target_height))
+    # Calculate scaling factor to fit within bounds while preserving aspect ratio
+    width_ratio = target_width / current_width
+    height_ratio = target_height / current_height
+    scale_factor = min(width_ratio, height_ratio)  # Fit within bounds
     
+    # Only resize if we're making it smaller
+    if scale_factor < 1.0:
+        new_width = int(current_width * scale_factor)
+        new_height = int(current_height * scale_factor)
+        logger.info(f"Resizing image from {current_width}x{current_height} to {new_width}x{new_height} (scale: {scale_factor:.3f})")
+        image = image.resize((new_width, new_height), Image.LANCZOS)
+    else:
+        logger.debug(f"Image {current_width}x{current_height} already fits within {target_width}x{target_height}, no resizing needed")
+    
+    # Convert to JPEG bytes
     img_byte_array = io.BytesIO()
     image.save(img_byte_array, format="JPEG")
     return img_byte_array.getvalue()
diff --git a/lib/idp_common_pkg/idp_common/ocr/service.py b/lib/idp_common_pkg/idp_common/ocr/service.py
@@ -32,6 +32,8 @@ def __init__(
         region: Optional[str] = None,
         max_workers: int = 20,
         enhanced_features: Union[bool, List[str]] = False,
+        dpi: int = 300,
+        resize_config: Optional[Dict[str, Any]] = None,
     ):
         """
         Initialize the OCR service.
@@ -43,12 +45,24 @@ def __init__(
                            - If False: Uses basic detect_document_text (faster, no features)
                            - If List[str]: Uses analyze_document with specified features
                               Valid features: TABLES, FORMS, SIGNATURES, LAYOUT
+            dpi: DPI (dots per inch) for image generation from PDF pages
+            resize_config: Optional dict with 'target_width' and 'target_height' keys; when both
+                          are set, page images are downscaled before being sent to OCR
 
         Raises:
             ValueError: If invalid features are specified in enhanced_features
         """
         self.region = region or os.environ.get("AWS_REGION", "us-east-1")
         self.max_workers = max_workers
+        self.dpi = dpi
+        self.resize_config = resize_config
+        
+        # Log DPI setting for debugging
+        logger.info(f"OCR Service initialized with DPI: {self.dpi}")
+        
+        # Log resize config if provided
+        if self.resize_config:
+            logger.info(f"OCR Service initialized with resize config: {self.resize_config}")
 
         # Define valid Textract feature types
         VALID_FEATURES = ["TABLES", "FORMS", "SIGNATURES", "LAYOUT"]
@@ -260,12 +274,12 @@ def _process_single_page(
         t0 = time.time()
         page_id = page_index + 1
 
-        # Extract page image
+        # Extract page image at specified DPI
         page = pdf_document.load_page(page_index)
-        pix = page.get_pixmap()
+        pix = page.get_pixmap(dpi=self.dpi)
         img_bytes = pix.tobytes("jpeg")
 
-        # Upload image to S3
+        # Upload original image to S3
         image_key = f"{prefix}/pages/{page_id}/image.jpg"
         s3.write_content(img_bytes, output_bucket, image_key, content_type="image/jpeg")
 
@@ -274,12 +288,23 @@ def _process_single_page(
             f"Time for image conversion (page {page_id}): {t1 - t0:.6f} seconds"
         )
 
-        # Process with Textract
+        # Resize image for OCR processing if configured
+        ocr_img_bytes = img_bytes  # Default to original image
+        if self.resize_config:
+            target_width = self.resize_config.get('target_width')
+            target_height = self.resize_config.get('target_height')
+            
+            if target_width and target_height:
+                from idp_common import image
+                ocr_img_bytes = image.resize_image(img_bytes, target_width, target_height)
+                logger.debug(f"Resized image for OCR processing (page {page_id}) to {target_width}x{target_height}")
+
+        # Process with OCR using potentially resized image
         if isinstance(self.enhanced_features, list) and self.enhanced_features:
-            textract_result = self._analyze_document(img_bytes, page_id)
+            textract_result = self._analyze_document(ocr_img_bytes, page_id)
         else:
             textract_result = self.textract_client.detect_document_text(
-                Document={"Bytes": img_bytes}
+                Document={"Bytes": ocr_img_bytes}
             )
 
         # Extract metering data
diff --git a/patterns/pattern-2/template.yaml b/patterns/pattern-2/template.yaml
@@ -136,10 +136,31 @@ Resources:
             required:
               - features
             properties:
+              image:
+                type: object
+                sectionLabel: Image Processing Settings
+                description: Configure image resolution and processing options for OCR
+                order: 0
+                properties:
+                  target_width:
+                    type: number
+                    description: "Maximum image width in pixels. Wider images are scaled down proportionally (aspect ratio preserved). Default: 1200"
+                    default: 1200
+                    minimum: 100
+                    maximum: 4096
+                    order: 0
+                  target_height:
+                    type: number
+                    description: "Maximum image height in pixels. Taller images are scaled down proportionally (aspect ratio preserved). Default: 1600"
+                    default: 1600
+                    minimum: 100
+                    maximum: 4096
+                    order: 1
               features:
                   type: array
                   listLabel: Features
                   itemLabel: Feature
+                  order: 1
                   items:
                     type: object
                     required:
@@ -370,6 +391,26 @@ Resources:
               - system_prompt
               - task_prompt
             properties:
+              image:
+                type: object
+                sectionLabel: Image Processing Settings
+                description: Configure image resolution and processing options for classification
+                order: 0
+                properties:
+                  target_width:
+                    type: number
+                    description: "Maximum image width in pixels. Wider images are scaled down proportionally (aspect ratio preserved). Default: 800"
+                    default: 800
+                    minimum: 100
+                    maximum: 4096
+                    order: 0
+                  target_height:
+                    type: number
+                    description: "Maximum image height in pixels. Taller images are scaled down proportionally (aspect ratio preserved). Default: 1000"
+                    default: 1000
+                    minimum: 100
+                    maximum: 4096
+                    order: 1
               model:
                 type: string
                 description: Model identifier
@@ -433,6 +474,26 @@ Resources:
               - system_prompt
               - task_prompt
             properties:
+              image:
+                type: object
+                sectionLabel: Image Processing Settings
+                description: Configure image resolution and processing options for extraction
+                order: 0
+                properties:
+                  target_width:
+                    type: number
+                    description: "Maximum image width in pixels. Wider images are scaled down proportionally (aspect ratio preserved). Default: 951"
+                    default: 951
+                    minimum: 100
+                    maximum: 4096
+                    order: 0
+                  target_height:
+                    type: number
+                    description: "Maximum image height in pixels. Taller images are scaled down proportionally (aspect ratio preserved). Default: 1268"
+                    default: 1268
+                    minimum: 100
+                    maximum: 4096
+                    order: 1
               model:
                 type: string
                 description: Model identifier
@@ -482,6 +543,26 @@ Resources:
             type: object
             sectionLabel: Assessment Inference
             properties:
+              image:
+                type: object
+                sectionLabel: Image Processing Settings
+                description: Configure image resolution and processing options for assessment
+                order: 0
+                properties:
+                  target_width:
+                    type: number
+                    description: "Maximum image width in pixels. Wider images are scaled down proportionally (aspect ratio preserved). Default: 800"
+                    default: 800
+                    minimum: 100
+                    maximum: 4096
+                    order: 0
+                  target_height:
+                    type: number
+                    description: "Maximum image height in pixels. Taller images are scaled down proportionally (aspect ratio preserved). Default: 1000"
+                    default: 1000
+                    minimum: 100
+                    maximum: 4096
+                    order: 1
               default_confidence_threshold:
                 type: number
                 description: Default confidence threshold for all attributes (0.0 to 1.0). If an attribute doesn't have its own threshold, this default will be used for confidence threshold alerts.