Skip to content

Commit cb8df7b

Browse files
committed
Merge branch 'feature/aianch/image-rescale' into 'develop'
feature: add image resize config. See merge request genaiic-reusable-assets/engagement-artifacts/genaiic-idp-accelerator!184
2 parents 92d9a4e + 387e374 commit cb8df7b

File tree

11 files changed

+2639
-32
lines changed

11 files changed

+2639
-32
lines changed

config_library/pattern-2/default/config.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ ocr:
77
- name: LAYOUT
88
- name: TABLES
99
- name: SIGNATURES
10+
image:
11+
target_width: '951'
12+
target_height: '1268'
1013
classes:
1114
- name: letter
1215
description: A formal written correspondence with sender/recipient addresses, date, salutation, body, and closing signature
@@ -300,6 +303,9 @@ classes:
300303
- name: comments
301304
description: Additional notes or remarks about the document. Look for sections labeled 'notes', 'remarks', or 'comments'.
302305
classification:
306+
image:
307+
target_width: '951'
308+
target_height: '1268'
303309
top_p: '0.1'
304310
max_tokens: '4096'
305311
top_k: '5'
@@ -439,6 +445,9 @@ classification:
439445
You are a document classification expert who can analyze and classify multiple documents and their page boundaries within a document package from various domains. Your task is to determine the document type based on its content and structure, using the provided document type definitions. Your output must be valid JSON according to the requested format.
440446
classificationMethod: textbasedHolisticClassification
441447
extraction:
448+
image:
449+
target_width: '951'
450+
target_height: '1268'
442451
top_p: '0.1'
443452
max_tokens: '10000'
444453
top_k: '5'
@@ -594,6 +603,9 @@ summarization:
594603
system_prompt: >-
595604
You are a document summarization expert who can analyze and summarize documents from various domains including medical, financial, legal, and general business documents. Your task is to create a summary that captures the key information, main points, and important details from the document. Your output must be in valid JSON format. \nSummarization Style: Balanced\\nCreate a balanced summary that provides a moderate level of detail. Include the main points and key supporting information, while maintaining the document's overall structure. Aim for a comprehensive yet concise summary.\n Your output MUST be in valid JSON format with markdown content. You MUST strictly adhere to the output format specified in the instructions.
596605
assessment:
606+
image:
607+
target_width: '951'
608+
target_height: '1268'
597609
default_confidence_threshold: '0.9'
598610
top_p: '0.1'
599611
max_tokens: '10000'

lib/idp_common_pkg/idp_common/assessment/service.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -677,15 +677,31 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
677677
t2 = time.time()
678678
logger.info(f"Time taken to read text content: {t2 - t1:.2f} seconds")
679679

680-
# Read page images
680+
# Read page images with configurable dimensions
681+
assessment_config = self.config.get("assessment", {})
682+
image_config = assessment_config.get("image", {})
683+
target_width = image_config.get("target_width")
684+
target_height = image_config.get("target_height")
685+
681686
page_images = []
682687
for page_id in sorted_page_ids:
683688
if page_id not in document.pages:
684689
continue
685690

686691
page = document.pages[page_id]
687692
image_uri = page.image_uri
688-
image_content = image.prepare_image(image_uri)
693+
694+
if target_width is not None and target_height is not None:
695+
# Cast to int in case config values are strings
696+
target_width = int(target_width)
697+
target_height = int(target_height)
698+
image_content = image.prepare_image(
699+
image_uri, target_width, target_height
700+
)
701+
else:
702+
image_content = image.prepare_image(
703+
image_uri
704+
) # Uses function defaults
689705
page_images.append(image_content)
690706

691707
t3 = time.time()
@@ -709,7 +725,6 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
709725
logger.info(f"Time taken to read raw OCR results: {t4 - t3:.2f} seconds")
710726

711727
# Get assessment configuration
712-
assessment_config = self.config.get("assessment", {})
713728
model_id = self.config.get("model_id") or assessment_config.get("model")
714729
temperature = _safe_float_conversion(
715730
assessment_config.get("temperature", 0), 0.0

lib/idp_common_pkg/idp_common/classification/service.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -605,10 +605,24 @@ def classify_page_bedrock(
605605
logger.warning(f"Failed to load text content from {text_uri}: {e}")
606606
# Continue without text content
607607

608-
# Load image content from URI
608+
# Load image content from URI with configurable dimensions
609609
if image_uri:
610610
try:
611-
image_content = s3.get_binary_content(image_uri)
611+
image_config = self.config.get("classification", {}).get("image", {})
612+
target_width = image_config.get("target_width")
613+
target_height = image_config.get("target_height")
614+
615+
if target_width is not None and target_height is not None:
616+
# Cast to int in case config values are strings
617+
target_width = int(target_width)
618+
target_height = int(target_height)
619+
image_content = image.prepare_image(
620+
image_uri, target_width, target_height
621+
)
622+
else:
623+
image_content = image.prepare_image(
624+
image_uri
625+
) # Uses function defaults
612626
except Exception as e:
613627
logger.warning(f"Failed to load image content from {image_uri}: {e}")
614628
# Continue without image content

lib/idp_common_pkg/idp_common/extraction/service.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -617,22 +617,36 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
617617
t1 = time.time()
618618
logger.info(f"Time taken to read text content: {t1 - t0:.2f} seconds")
619619

620-
# Read page images
620+
# Read page images with configurable dimensions
621+
extraction_config = self.config.get("extraction", {})
622+
image_config = extraction_config.get("image", {})
623+
target_width = image_config.get("target_width")
624+
target_height = image_config.get("target_height")
625+
621626
page_images = []
622627
for page_id in sorted_page_ids:
623628
if page_id not in document.pages:
624629
continue
625630

626631
page = document.pages[page_id]
627632
image_uri = page.image_uri
628-
image_content = image.prepare_image(image_uri)
633+
if target_width is not None and target_height is not None:
634+
# Cast to int in case config values are strings
635+
target_width = int(target_width)
636+
target_height = int(target_height)
637+
image_content = image.prepare_image(
638+
image_uri, target_width, target_height
639+
)
640+
else:
641+
image_content = image.prepare_image(
642+
image_uri
643+
) # Uses function defaults
629644
page_images.append(image_content)
630645

631646
t2 = time.time()
632647
logger.info(f"Time taken to read images: {t2 - t1:.2f} seconds")
633648

634649
# Get extraction configuration
635-
extraction_config = self.config.get("extraction", {})
636650
model_id = self.config.get("model_id") or extraction_config.get("model")
637651
temperature = float(extraction_config.get("temperature", 0))
638652
top_k = float(extraction_config.get("top_k", 5))

lib/idp_common_pkg/idp_common/image/__init__.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ def resize_image(image_data: bytes,
1414
target_width: int = 951,
1515
target_height: int = 1268) -> bytes:
1616
"""
17-
Resize an image to target dimensions if larger than target
17+
Resize an image to fit within target dimensions while preserving aspect ratio.
18+
No padding, no distortion - pure proportional scaling. Images that already fit within the target dimensions are never enlarged.
1819
1920
Args:
2021
image_data: Raw image bytes
@@ -26,13 +27,22 @@ def resize_image(image_data: bytes,
2627
"""
2728
image = Image.open(io.BytesIO(image_data))
2829
current_width, current_height = image.size
29-
current_resolution = current_width * current_height
30-
target_resolution = target_width * target_height
3130

32-
if current_resolution > target_resolution:
33-
logger.info(f"Downsizing image from {current_width}x{current_height}")
34-
image = image.resize((target_width, target_height))
31+
# Calculate scaling factor to fit within bounds while preserving aspect ratio
32+
width_ratio = target_width / current_width
33+
height_ratio = target_height / current_height
34+
scale_factor = min(width_ratio, height_ratio) # Fit within bounds
3535

36+
# Only resize if we're making it smaller
37+
if scale_factor < 1.0:
38+
new_width = int(current_width * scale_factor)
39+
new_height = int(current_height * scale_factor)
40+
logger.info(f"Resizing image from {current_width}x{current_height} to {new_width}x{new_height} (scale: {scale_factor:.3f})")
41+
image = image.resize((new_width, new_height), Image.LANCZOS)
42+
else:
43+
logger.debug(f"Image {current_width}x{current_height} already fits within {target_width}x{target_height}, no resizing needed")
44+
45+
# Convert to JPEG bytes
3646
img_byte_array = io.BytesIO()
3747
image.save(img_byte_array, format="JPEG")
3848
return img_byte_array.getvalue()

lib/idp_common_pkg/idp_common/ocr/service.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ def __init__(
3232
region: Optional[str] = None,
3333
max_workers: int = 20,
3434
enhanced_features: Union[bool, List[str]] = False,
35+
dpi: int = 300,
36+
resize_config: Optional[Dict[str, Any]] = None,
3537
):
3638
"""
3739
Initialize the OCR service.
@@ -43,12 +45,26 @@ def __init__(
4345
- If False: Uses basic detect_document_text (faster, no features)
4446
- If List[str]: Uses analyze_document with specified features
4547
Valid features: TABLES, FORMS, SIGNATURES, LAYOUT
48+
dpi: DPI (dots per inch) for image generation from PDF pages
49+
resize_config: Optional dictionary containing image resizing configuration
50+
with 'target_width' and 'target_height' keys
4651
4752
Raises:
4853
ValueError: If invalid features are specified in enhanced_features
4954
"""
5055
self.region = region or os.environ.get("AWS_REGION", "us-east-1")
5156
self.max_workers = max_workers
57+
self.dpi = dpi
58+
self.resize_config = resize_config
59+
60+
# Log DPI setting for debugging
61+
logger.info(f"OCR Service initialized with DPI: {self.dpi}")
62+
63+
# Log resize config if provided
64+
if self.resize_config:
65+
logger.info(
66+
f"OCR Service initialized with resize config: {self.resize_config}"
67+
)
5268

5369
# Define valid Textract feature types
5470
VALID_FEATURES = ["TABLES", "FORMS", "SIGNATURES", "LAYOUT"]
@@ -260,12 +276,12 @@ def _process_single_page(
260276
t0 = time.time()
261277
page_id = page_index + 1
262278

263-
# Extract page image
279+
# Extract page image at specified DPI
264280
page = pdf_document.load_page(page_index)
265-
pix = page.get_pixmap()
281+
pix = page.get_pixmap(dpi=self.dpi)
266282
img_bytes = pix.tobytes("jpeg")
267283

268-
# Upload image to S3
284+
# Upload original image to S3
269285
image_key = f"{prefix}/pages/{page_id}/image.jpg"
270286
s3.write_content(img_bytes, output_bucket, image_key, content_type="image/jpeg")
271287

@@ -274,12 +290,25 @@ def _process_single_page(
274290
f"Time for image conversion (page {page_id}): {t1 - t0:.6f} seconds"
275291
)
276292

277-
# Process with Textract
293+
# Resize image for OCR processing if configured
294+
ocr_img_bytes = img_bytes # Default to original image
295+
if self.resize_config:
296+
from idp_common import image
297+
298+
target_width = self.resize_config.get("target_width")
299+
target_height = self.resize_config.get("target_height")
300+
301+
ocr_img_bytes = image.resize_image(img_bytes, target_width, target_height)
302+
logger.debug(
303+
f"Resized image for OCR processing (page {page_id}) to {target_width}x{target_height}"
304+
)
305+
306+
# Process with OCR using potentially resized image
278307
if isinstance(self.enhanced_features, list) and self.enhanced_features:
279-
textract_result = self._analyze_document(img_bytes, page_id)
308+
textract_result = self._analyze_document(ocr_img_bytes, page_id)
280309
else:
281310
textract_result = self.textract_client.detect_document_text(
282-
Document={"Bytes": img_bytes}
311+
Document={"Bytes": ocr_img_bytes}
283312
)
284313

285314
# Extract metering data

lib/idp_common_pkg/tests/unit/classification/test_classification_service.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -190,19 +190,24 @@ def test_prepare_prompt_from_template(self, service):
190190
assert result == "Formatted prompt"
191191

192192
@patch("idp_common.s3.get_text_content")
193-
@patch("idp_common.s3.get_binary_content")
193+
@patch("idp_common.image.prepare_image")
194194
@patch(
195195
"idp_common.classification.service.ClassificationService._invoke_bedrock_model"
196196
)
197197
@patch("idp_common.image.prepare_bedrock_image_attachment")
198198
def test_classify_page_bedrock_success(
199-
self, mock_prepare_image, mock_invoke, mock_get_binary, mock_get_text, service
199+
self,
200+
mock_prepare_bedrock_image,
201+
mock_invoke,
202+
mock_prepare_image,
203+
mock_get_text,
204+
service,
200205
):
201206
"""Test successful page classification with Bedrock."""
202207
# Mock responses
203208
mock_get_text.return_value = "This is an invoice for $100"
204-
mock_get_binary.return_value = b"image_data"
205-
mock_prepare_image.return_value = {"image": "base64_encoded_image"}
209+
mock_prepare_image.return_value = b"image_data"
210+
mock_prepare_bedrock_image.return_value = {"image": "base64_encoded_image"}
206211
mock_invoke.return_value = {
207212
"response": {
208213
"output": {"message": {"content": [{"text": '{"class": "invoice"}'}]}}
@@ -227,8 +232,8 @@ def test_classify_page_bedrock_success(
227232

228233
# Verify calls
229234
mock_get_text.assert_called_once_with("s3://bucket/text.txt")
230-
mock_get_binary.assert_called_once_with("s3://bucket/image.jpg")
231-
mock_prepare_image.assert_called_once_with(b"image_data")
235+
mock_prepare_image.assert_called_once_with("s3://bucket/image.jpg")
236+
mock_prepare_bedrock_image.assert_called_once_with(b"image_data")
232237
mock_invoke.assert_called_once()
233238

234239
@patch("idp_common.s3.get_text_content")

0 commit comments

Comments
 (0)