aws-solutions-library-samples
diff --git a/config_library/pattern-2/default/config.yaml b/config_library/pattern-2/default/config.yaml
Lines changed: 12 additions & 0 deletions
diff --git a/lib/idp_common_pkg/idp_common/assessment/service.py b/lib/idp_common_pkg/idp_common/assessment/service.py
Lines changed: 14 additions & 7 deletions
diff --git a/lib/idp_common_pkg/idp_common/classification/service.py b/lib/idp_common_pkg/idp_common/classification/service.py
Lines changed: 12 additions & 6 deletions
diff --git a/lib/idp_common_pkg/idp_common/extraction/service.py b/lib/idp_common_pkg/idp_common/extraction/service.py
Lines changed: 13 additions & 7 deletions
diff --git a/lib/idp_common_pkg/idp_common/ocr/service.py b/lib/idp_common_pkg/idp_common/ocr/service.py
Lines changed: 6 additions & 9 deletions
@@ -7,6 +7,9 @@ ocr:
     - name: LAYOUT
     - name: TABLES
     - name: SIGNATURES
+  image:
+    target_width: '951'
+    target_height: '1268'
 classes:
   - name: letter
     description: A formal written correspondence with sender/recipient addresses, date, salutation, body, and closing signature
@@ -300,6 +303,9 @@ classes:
       - name: comments
         description: Additional notes or remarks about the document. Look for sections labeled 'notes', 'remarks', or 'comments'.
 classification:
+  image:
+    target_width: '951'
+    target_height: '1268'
   top_p: '0.1'
   max_tokens: '4096'
   top_k: '5'
@@ -439,6 +445,9 @@ classification:
     You are a document classification expert who can analyze and classify multiple documents and their page boundaries within a document package from various domains. Your task is to determine the document type based on its content and structure, using the provided document type definitions. Your output must be valid JSON according to the requested format.
   classificationMethod: textbasedHolisticClassification
 extraction:
+  image:
+    target_width: '951'
+    target_height: '1268'
   top_p: '0.1'
   max_tokens: '10000'
   top_k: '5'
@@ -594,6 +603,9 @@ summarization:
   system_prompt: >-
     You are a document summarization expert who can analyze and summarize documents from various domains including medical, financial, legal, and general business documents. Your task is to create a summary that captures the key information, main points, and important details from the document. Your output must be in valid JSON format. \nSummarization Style: Balanced\\nCreate a balanced summary that provides a moderate level of detail. Include the main points and key supporting information, while maintaining the document's overall structure. Aim for a comprehensive yet concise summary.\n Your output MUST be in valid JSON format with markdown content. You MUST strictly adhere to the output format specified in the instructions.
 assessment:
+  image:
+    target_width: '951'
+    target_height: '1268'
   default_confidence_threshold: '0.9'
   top_p: '0.1'
   max_tokens: '10000'
 
@@ -678,9 +678,10 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
             logger.info(f"Time taken to read text content: {t2 - t1:.2f} seconds")
 
             # Read page images with configurable dimensions
-            image_config = self.config.get("image", {})
-            target_width = image_config.get("target_width", 951)  # Default fallback
-            target_height = image_config.get("target_height", 1268)
+            assessment_config = self.config.get("assessment", {})
+            image_config = assessment_config.get("image", {})
+            target_width = image_config.get("target_width")
+            target_height = image_config.get("target_height")
 
             page_images = []
             for page_id in sorted_page_ids:
@@ -689,9 +690,16 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
 
                 page = document.pages[page_id]
                 image_uri = page.image_uri
-                image_content = image.prepare_image(
-                    image_uri, target_width, target_height
-                )
+                
+                if target_width is not None and target_height is not None:
+                    # Cast to int in case config values are strings
+                    target_width = int(target_width)
+                    target_height = int(target_height)
+                    image_content = image.prepare_image(
+                        image_uri, target_width, target_height
+                    )
+                else:
+                    image_content = image.prepare_image(image_uri)  # Uses function defaults
                 page_images.append(image_content)
 
             t3 = time.time()
@@ -715,7 +723,6 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
             logger.info(f"Time taken to read raw OCR results: {t4 - t3:.2f} seconds")
 
             # Get assessment configuration
-            assessment_config = self.config.get("assessment", {})
             model_id = self.config.get("model_id") or assessment_config.get("model")
             temperature = _safe_float_conversion(
                 assessment_config.get("temperature", 0), 0.0
 
@@ -609,12 +609,18 @@ def classify_page_bedrock(
         if image_uri:
             try:
                 image_config = self.config.get("image", {})
-                target_width = image_config.get("target_width", 951)  # Default fallback
-                target_height = image_config.get("target_height", 1268)
-
-                image_content = image.prepare_image(
-                    image_uri, target_width, target_height
-                )
+                target_width = image_config.get("target_width")
+                target_height = image_config.get("target_height")
+
+                if target_width is not None and target_height is not None:
+                    # Cast to int in case config values are strings
+                    target_width = int(target_width)
+                    target_height = int(target_height)
+                    image_content = image.prepare_image(
+                        image_uri, target_width, target_height
+                    )
+                else:
+                    image_content = image.prepare_image(image_uri)  # Uses function defaults
             except Exception as e:
                 logger.warning(f"Failed to load image content from {image_uri}: {e}")
                 # Continue without image content
 
@@ -618,9 +618,10 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
             logger.info(f"Time taken to read text content: {t1 - t0:.2f} seconds")
 
             # Read page images with configurable dimensions
-            image_config = self.config.get("image", {})
-            target_width = image_config.get("target_width", 951)  # Default fallback
-            target_height = image_config.get("target_height", 1268)
+            extraction_config = self.config.get("extraction", {})
+            image_config = extraction_config.get("image", {})
+            target_width = image_config.get("target_width")
+            target_height = image_config.get("target_height")
 
             page_images = []
             for page_id in sorted_page_ids:
@@ -629,16 +630,21 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
 
                 page = document.pages[page_id]
                 image_uri = page.image_uri
-                image_content = image.prepare_image(
-                    image_uri, target_width, target_height
-                )
+                if target_width is not None and target_height is not None:
+                    # Cast to int in case config values are strings
+                    target_width = int(target_width)
+                    target_height = int(target_height)
+                    image_content = image.prepare_image(
+                        image_uri, target_width, target_height
+                    )
+                else:
+                    image_content = image.prepare_image(image_uri)  # Uses function defaults
                 page_images.append(image_content)
 
             t2 = time.time()
             logger.info(f"Time taken to read images: {t2 - t1:.2f} seconds")
 
             # Get extraction configuration
-            extraction_config = self.config.get("extraction", {})
             model_id = self.config.get("model_id") or extraction_config.get("model")
             temperature = float(extraction_config.get("temperature", 0))
             top_k = float(extraction_config.get("top_k", 5))
 
@@ -293,18 +293,15 @@ def _process_single_page(
         # Resize image for OCR processing if configured
         ocr_img_bytes = img_bytes  # Default to original image
         if self.resize_config:
+            from idp_common import image
+
             target_width = self.resize_config.get("target_width")
             target_height = self.resize_config.get("target_height")
 
-            if target_width and target_height:
-                from idp_common import image
-
-                ocr_img_bytes = image.resize_image(
-                    img_bytes, target_width, target_height
-                )
-                logger.debug(
-                    f"Resized image for OCR processing (page {page_id}) to {target_width}x{target_height}"
-                )
+            ocr_img_bytes = image.resize_image(img_bytes, target_width, target_height)
+            logger.debug(
+                f"Resized image for OCR processing (page {page_id}) to {target_width}x{target_height}"
+            )
 
         # Process with OCR using potentially resized image
         if isinstance(self.enhanced_features, list) and self.enhanced_features: