Skip to content

Commit f38f6aa

Browse files
committed
feature: add image resize config
1 parent 6ec447f commit f38f6aa

File tree

6 files changed

+146
-18
lines changed

6 files changed

+146
-18
lines changed

lib/idp_common_pkg/idp_common/assessment/service.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -677,15 +677,19 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
677677
t2 = time.time()
678678
logger.info(f"Time taken to read text content: {t2 - t1:.2f} seconds")
679679

680-
# Read page images
680+
# Read page images with configurable dimensions
681+
image_config = self.config.get('image', {})
682+
target_width = image_config.get('target_width', 951) # Default fallback
683+
target_height = image_config.get('target_height', 1268)
684+
681685
page_images = []
682686
for page_id in sorted_page_ids:
683687
if page_id not in document.pages:
684688
continue
685689

686690
page = document.pages[page_id]
687691
image_uri = page.image_uri
688-
image_content = image.prepare_image(image_uri)
692+
image_content = image.prepare_image(image_uri, target_width, target_height)
689693
page_images.append(image_content)
690694

691695
t3 = time.time()

lib/idp_common_pkg/idp_common/classification/service.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -605,10 +605,14 @@ def classify_page_bedrock(
605605
logger.warning(f"Failed to load text content from {text_uri}: {e}")
606606
# Continue without text content
607607

608-
# Load image content from URI
608+
# Load image content from URI with configurable dimensions
609609
if image_uri:
610610
try:
611-
image_content = s3.get_binary_content(image_uri)
611+
image_config = self.config.get('image', {})
612+
target_width = image_config.get('target_width', 951) # Default fallback
613+
target_height = image_config.get('target_height', 1268)
614+
615+
image_content = image.prepare_image(image_uri, target_width, target_height)
612616
except Exception as e:
613617
logger.warning(f"Failed to load image content from {image_uri}: {e}")
614618
# Continue without image content

lib/idp_common_pkg/idp_common/extraction/service.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -617,15 +617,19 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
617617
t1 = time.time()
618618
logger.info(f"Time taken to read text content: {t1 - t0:.2f} seconds")
619619

620-
# Read page images
620+
# Read page images with configurable dimensions
621+
image_config = self.config.get('image', {})
622+
target_width = image_config.get('target_width', 951) # Default fallback
623+
target_height = image_config.get('target_height', 1268)
624+
621625
page_images = []
622626
for page_id in sorted_page_ids:
623627
if page_id not in document.pages:
624628
continue
625629

626630
page = document.pages[page_id]
627631
image_uri = page.image_uri
628-
image_content = image.prepare_image(image_uri)
632+
image_content = image.prepare_image(image_uri, target_width, target_height)
629633
page_images.append(image_content)
630634

631635
t2 = time.time()

lib/idp_common_pkg/idp_common/image/__init__.py

Lines changed: 16 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,8 @@ def resize_image(image_data: bytes,
1414
target_width: int = 951,
1515
target_height: int = 1268) -> bytes:
1616
"""
17-
Resize an image to target dimensions if larger than target
17+
Resize an image to fit within target dimensions while preserving aspect ratio.
18+
No padding, no distortion - pure proportional scaling.
1819
1920
Args:
2021
image_data: Raw image bytes
@@ -26,13 +27,22 @@ def resize_image(image_data: bytes,
2627
"""
2728
image = Image.open(io.BytesIO(image_data))
2829
current_width, current_height = image.size
29-
current_resolution = current_width * current_height
30-
target_resolution = target_width * target_height
3130

32-
if current_resolution > target_resolution:
33-
logger.info(f"Downsizing image from {current_width}x{current_height}")
34-
image = image.resize((target_width, target_height))
31+
# Calculate scaling factor to fit within bounds while preserving aspect ratio
32+
width_ratio = target_width / current_width
33+
height_ratio = target_height / current_height
34+
scale_factor = min(width_ratio, height_ratio) # Fit within bounds
3535

36+
# Only resize if we're making it smaller
37+
if scale_factor < 1.0:
38+
new_width = int(current_width * scale_factor)
39+
new_height = int(current_height * scale_factor)
40+
logger.info(f"Resizing image from {current_width}x{current_height} to {new_width}x{new_height} (scale: {scale_factor:.3f})")
41+
image = image.resize((new_width, new_height), Image.LANCZOS)
42+
else:
43+
logger.debug(f"Image {current_width}x{current_height} already fits within {target_width}x{target_height}, no resizing needed")
44+
45+
# Convert to JPEG bytes
3646
img_byte_array = io.BytesIO()
3747
image.save(img_byte_array, format="JPEG")
3848
return img_byte_array.getvalue()

lib/idp_common_pkg/idp_common/ocr/service.py

Lines changed: 31 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,8 @@ def __init__(
3232
region: Optional[str] = None,
3333
max_workers: int = 20,
3434
enhanced_features: Union[bool, List[str]] = False,
35+
dpi: int = 300,
36+
resize_config: Optional[Dict[str, Any]] = None,
3537
):
3638
"""
3739
Initialize the OCR service.
@@ -43,12 +45,24 @@ def __init__(
4345
- If False: Uses basic detect_document_text (faster, no features)
4446
- If List[str]: Uses analyze_document with specified features
4547
Valid features: TABLES, FORMS, SIGNATURES, LAYOUT
48+
dpi: DPI (dots per inch) for image generation from PDF pages
49+
resize_config: Optional dictionary containing image resizing configuration
50+
with 'target_width' and 'target_height' keys
4651
4752
Raises:
4853
ValueError: If invalid features are specified in enhanced_features
4954
"""
5055
self.region = region or os.environ.get("AWS_REGION", "us-east-1")
5156
self.max_workers = max_workers
57+
self.dpi = dpi
58+
self.resize_config = resize_config
59+
60+
# Log DPI setting for debugging
61+
logger.info(f"OCR Service initialized with DPI: {self.dpi}")
62+
63+
# Log resize config if provided
64+
if self.resize_config:
65+
logger.info(f"OCR Service initialized with resize config: {self.resize_config}")
5266

5367
# Define valid Textract feature types
5468
VALID_FEATURES = ["TABLES", "FORMS", "SIGNATURES", "LAYOUT"]
@@ -260,12 +274,12 @@ def _process_single_page(
260274
t0 = time.time()
261275
page_id = page_index + 1
262276

263-
# Extract page image
277+
# Extract page image at specified DPI
264278
page = pdf_document.load_page(page_index)
265-
pix = page.get_pixmap()
279+
pix = page.get_pixmap(dpi=self.dpi)
266280
img_bytes = pix.tobytes("jpeg")
267281

268-
# Upload image to S3
282+
# Upload original image to S3
269283
image_key = f"{prefix}/pages/{page_id}/image.jpg"
270284
s3.write_content(img_bytes, output_bucket, image_key, content_type="image/jpeg")
271285

@@ -274,12 +288,23 @@ def _process_single_page(
274288
f"Time for image conversion (page {page_id}): {t1 - t0:.6f} seconds"
275289
)
276290

277-
# Process with Textract
291+
# Resize image for OCR processing if configured
292+
ocr_img_bytes = img_bytes # Default to original image
293+
if self.resize_config:
294+
target_width = self.resize_config.get('target_width')
295+
target_height = self.resize_config.get('target_height')
296+
297+
if target_width and target_height:
298+
from idp_common import image
299+
ocr_img_bytes = image.resize_image(img_bytes, target_width, target_height)
300+
logger.debug(f"Resized image for OCR processing (page {page_id}) to {target_width}x{target_height}")
301+
302+
# Process with OCR using potentially resized image
278303
if isinstance(self.enhanced_features, list) and self.enhanced_features:
279-
textract_result = self._analyze_document(img_bytes, page_id)
304+
textract_result = self._analyze_document(ocr_img_bytes, page_id)
280305
else:
281306
textract_result = self.textract_client.detect_document_text(
282-
Document={"Bytes": img_bytes}
307+
Document={"Bytes": ocr_img_bytes}
283308
)
284309

285310
# Extract metering data

patterns/pattern-2/template.yaml

Lines changed: 81 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -136,10 +136,31 @@ Resources:
136136
required:
137137
- features
138138
properties:
139+
image:
140+
type: object
141+
sectionLabel: Image Processing Settings
142+
description: Configure image resolution and processing options for OCR
143+
order: 0
144+
properties:
145+
target_width:
146+
type: number
147+
description: "Target image width in pixels. Images larger than this will be resized. Default: 1200"
148+
default: 1200
149+
minimum: 100
150+
maximum: 4096
151+
order: 0
152+
target_height:
153+
type: number
154+
description: "Target image height in pixels. Images larger than this will be resized. Default: 1600"
155+
default: 1600
156+
minimum: 100
157+
maximum: 4096
158+
order: 1
139159
features:
140160
type: array
141161
listLabel: Features
142162
itemLabel: Feature
163+
order: 1
143164
items:
144165
type: object
145166
required:
@@ -370,6 +391,26 @@ Resources:
370391
- system_prompt
371392
- task_prompt
372393
properties:
394+
image:
395+
type: object
396+
sectionLabel: Image Processing Settings
397+
description: Configure image resolution and processing options for classification
398+
order: 0
399+
properties:
400+
target_width:
401+
type: number
402+
description: "Target image width in pixels. Images larger than this will be resized. Default: 800"
403+
default: 800
404+
minimum: 100
405+
maximum: 4096
406+
order: 0
407+
target_height:
408+
type: number
409+
description: "Target image height in pixels. Images larger than this will be resized. Default: 1000"
410+
default: 1000
411+
minimum: 100
412+
maximum: 4096
413+
order: 1
373414
model:
374415
type: string
375416
description: Model identifier
@@ -433,6 +474,26 @@ Resources:
433474
- system_prompt
434475
- task_prompt
435476
properties:
477+
image:
478+
type: object
479+
sectionLabel: Image Processing Settings
480+
description: Configure image resolution and processing options for extraction
481+
order: 0
482+
properties:
483+
target_width:
484+
type: number
485+
description: "Target image width in pixels. Images larger than this will be resized. Default: 951"
486+
default: 951
487+
minimum: 100
488+
maximum: 4096
489+
order: 0
490+
target_height:
491+
type: number
492+
description: "Target image height in pixels. Images larger than this will be resized. Default: 1268"
493+
default: 1268
494+
minimum: 100
495+
maximum: 4096
496+
order: 1
436497
model:
437498
type: string
438499
description: Model identifier
@@ -482,6 +543,26 @@ Resources:
482543
type: object
483544
sectionLabel: Assessment Inference
484545
properties:
546+
image:
547+
type: object
548+
sectionLabel: Image Processing Settings
549+
description: Configure image resolution and processing options for assessment
550+
order: 0
551+
properties:
552+
target_width:
553+
type: number
554+
description: "Target image width in pixels. Images larger than this will be resized. Default: 800"
555+
default: 800
556+
minimum: 100
557+
maximum: 4096
558+
order: 0
559+
target_height:
560+
type: number
561+
description: "Target image height in pixels. Images larger than this will be resized. Default: 1000"
562+
default: 1000
563+
minimum: 100
564+
maximum: 4096
565+
order: 1
485566
default_confidence_threshold:
486567
type: number
487568
description: Default confidence threshold for all attributes (0.0 to 1.0). If an attribute doesn't have its own threshold, this default will be used for confidence threshold alerts.

0 commit comments

Comments
 (0)