Skip to content

Commit fff7fef

Browse files
author
Bob Strahan
committed
refactor(ocr): simplify OCR service initialization with unified config pattern
1 parent 7d03563 commit fff7fef

File tree

7 files changed

+256
-208
lines changed

7 files changed

+256
-208
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@ SPDX-License-Identifier: MIT-0
1414
- Fixed view toggle behavior - switching between views no longer closes the viewer window
1515
- Reordered view buttons to: Markdown View, Text Confidence View, Text View for better user experience
1616

17+
- **Simplified OCR Service Initialization**
18+
- OCR service now accepts a single `config` dictionary parameter for cleaner, more consistent API
19+
- Aligned with classification service pattern for better consistency across IDP services
20+
- Automatic extraction of all OCR settings from configuration dictionary
21+
- Backward compatibility maintained - the old parameter pattern is still supported when no `config` is passed, and logs a deprecation warning
22+
- Updated all lambda functions and notebooks to use new simplified pattern
23+
- Comprehensive migration guide added to OCR README
24+
1725
### Changed
1826
- **Converted text confidence data format from JSON to markdown table for improved readability and reduced token usage**
1927
- Removed unnecessary "page_count" field

lib/idp_common_pkg/idp_common/ocr/README.md

Lines changed: 91 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,15 @@ The service supports three OCR backends, each with different capabilities and us
5151

5252
## Usage Example
5353

54+
### New Simplified Pattern (Recommended)
55+
5456
```python
55-
from idp_common import ocr
57+
from idp_common import ocr, get_config
5658
from idp_common.models import Document
5759

60+
# Load configuration (typically from DynamoDB)
61+
config = get_config()
62+
5863
# Create or retrieve a Document object with input/output details
5964
document = Document(
6065
id="doc-123",
@@ -63,14 +68,11 @@ document = Document(
6368
output_bucket="output-bucket"
6469
)
6570

66-
# Initialize OCR service
71+
# Initialize OCR service with config dictionary
6772
ocr_service = ocr.OcrService(
6873
region='us-east-1',
69-
max_workers=20,
70-
enhanced_features=False # Default: basic text detection (faster)
71-
# enhanced_features=["TABLES", "FORMS"] # For table and form recognition
72-
# enhanced_features=["LAYOUT"] # For layout analysis
73-
# enhanced_features=["TABLES", "FORMS", "SIGNATURES"] # Multiple features
74+
config=config, # Pass entire config dictionary
75+
backend='textract' # Optional: override backend from config
7476
)
7577

7678
# Process document - this will automatically get the PDF from S3
@@ -84,6 +86,88 @@ for page_id, page in processed_document.pages.items():
8486
print(f"Page {page_id}: Text confidence data at {page.text_confidence_uri}")
8587
```
8688

89+
### Legacy Pattern (Deprecated)
90+
91+
```python
92+
# The old pattern with individual parameters is still supported but deprecated
93+
ocr_service = ocr.OcrService(
94+
region='us-east-1',
95+
max_workers=20,
96+
enhanced_features=False, # or ["TABLES", "FORMS"]
97+
dpi=150,
98+
resize_config={"target_width": 1024, "target_height": 1024},
99+
backend='textract'
100+
)
101+
```
102+
103+
## Configuration Structure
104+
105+
When using the new pattern, the OCR service expects configuration in the following structure:
106+
107+
```yaml
108+
ocr:
109+
backend: "textract" # Options: "textract", "bedrock", "none"
110+
max_workers: 20
111+
dpi: 150
112+
features:
113+
- name: "TABLES"
114+
- name: "FORMS"
115+
image:
116+
target_width: 1024
117+
target_height: 1024
118+
preprocessing: false # Enable adaptive binarization
119+
# For Bedrock backend only (all three keys are required; if any is missing, no Bedrock config is set):
120+
model_id: "anthropic.claude-3-sonnet-20240229-v1:0"
121+
system_prompt: "You are an OCR system..."
122+
task_prompt: "Extract all text from this image..."
123+
```
124+
125+
## Migration Guide
126+
127+
To migrate from the old pattern to the new pattern:
128+
129+
1. **In Lambda functions:**
130+
```python
131+
# Old pattern
132+
features = [feature['name'] for feature in ocr_config.get("features", [])]
133+
service = ocr.OcrService(
134+
region=region,
135+
max_workers=MAX_WORKERS,
136+
enhanced_features=features,
137+
resize_config=resize_config,
138+
backend=backend
139+
)
140+
141+
# New pattern
142+
config = get_config()
143+
service = ocr.OcrService(
144+
region=region,
145+
config=config,
146+
backend=config.get("ocr", {}).get("backend", "textract")
147+
)
148+
```
149+
150+
2. **In notebooks:**
151+
```python
152+
# Old pattern
153+
ocr_service = ocr.OcrService(
154+
region=region,
155+
enhanced_features=features
156+
)
157+
158+
# New pattern
159+
ocr_service = ocr.OcrService(
160+
region=region,
161+
config=CONFIG # Where CONFIG is your loaded configuration
162+
)
163+
```
164+
165+
The new pattern provides:
166+
- Cleaner, more consistent API across all IDP services
167+
- Easier configuration management
168+
- No need to extract individual parameters
169+
- Future-proof design for adding new features
170+
87171
## Text Confidence Data
88172

89173
The OCR service automatically generates optimized text confidence data for each page, which is specifically designed for LLM assessment prompts. This feature dramatically reduces token usage while preserving all information needed for confidence evaluation.

lib/idp_common_pkg/idp_common/ocr/service.py

Lines changed: 121 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -32,44 +32,136 @@ class OcrService:
3232
def __init__(
3333
self,
3434
region: Optional[str] = None,
35-
max_workers: int = 20,
36-
enhanced_features: Union[bool, List[str]] = False,
35+
config: Optional[Dict[str, Any]] = None,
36+
backend: Optional[str] = None,
37+
max_workers: Optional[int] = None,
38+
# Deprecated parameters for backward compatibility
39+
enhanced_features: Optional[Union[bool, List[str]]] = None,
3740
dpi: Optional[int] = None,
3841
resize_config: Optional[Dict[str, Any]] = None,
39-
bedrock_config: Dict[str, Any] = None,
40-
backend: str = "textract", # New parameter: "textract" or "bedrock"
41-
preprocessing_config: Optional[
42-
Dict[str, Any]
43-
] = None, # New parameter for preprocessing
42+
bedrock_config: Optional[Dict[str, Any]] = None,
43+
preprocessing_config: Optional[Dict[str, Any]] = None,
4444
):
4545
"""
4646
Initialize the OCR service.
4747
4848
Args:
4949
region: AWS region for services
50+
config: Configuration dictionary containing all OCR settings
51+
backend: OCR backend to use ("textract", "bedrock", or "none")
5052
max_workers: Maximum number of concurrent workers for page processing
51-
enhanced_features: Controls Textract FeatureTypes for analyze_document API:
52-
- If False: Uses basic detect_document_text (faster, no features)
53-
- If List[str]: Uses analyze_document with specified features
54-
Valid features: TABLES, FORMS, SIGNATURES, LAYOUT
53+
54+
Deprecated parameters (use config instead):
55+
enhanced_features: Controls Textract FeatureTypes for analyze_document API
5556
dpi: DPI (dots per inch) for image generation from PDF pages
56-
resize_config: Optional dictionary containing image resizing configuration
57-
with 'target_width' and 'target_height' keys
58-
backend: OCR backend to use ("textract" or "bedrock")
59-
bedrock_config: Optional dictionary containing bedrock configuration if backend is "bedrock"
60-
config: Configuration dictionary
57+
resize_config: Image resizing configuration
58+
bedrock_config: Bedrock configuration if backend is "bedrock"
59+
preprocessing_config: Preprocessing configuration
6160
6261
Raises:
63-
ValueError: If invalid features are specified in enhanced_features
64-
or if an invalid backend is specified
62+
ValueError: If invalid features are specified or if an invalid backend is specified
6563
"""
66-
self.region = region or os.environ.get("AWS_REGION", "us-east-1")
67-
self.max_workers = max_workers
68-
self.dpi = dpi
69-
self.resize_config = resize_config
70-
self.backend = backend.lower()
71-
self.bedrock_config = bedrock_config
72-
self.preprocessing_config = preprocessing_config
64+
# Handle backward compatibility
65+
if config is None and any(
66+
[
67+
enhanced_features is not None,
68+
dpi is not None,
69+
resize_config is not None,
70+
bedrock_config is not None,
71+
preprocessing_config is not None,
72+
]
73+
):
74+
logger.warning(
75+
"Using deprecated parameter pattern. Please migrate to using 'config' parameter. "
76+
"See OCR README for migration guide."
77+
)
78+
# Use old parameters
79+
self.region = region or os.environ.get("AWS_REGION", "us-east-1")
80+
self.max_workers = max_workers or 20
81+
self.dpi = dpi
82+
self.resize_config = resize_config
83+
self.backend = (backend or "textract").lower()
84+
self.bedrock_config = bedrock_config
85+
self.preprocessing_config = preprocessing_config
86+
self.enhanced_features = enhanced_features
87+
else:
88+
# New pattern - extract from config
89+
self.region = region or os.environ.get("AWS_REGION", "us-east-1")
90+
self.config = config or {}
91+
ocr_config = self.config.get("ocr", {})
92+
93+
# Extract backend
94+
self.backend = (backend or ocr_config.get("backend", "textract")).lower()
95+
96+
# Extract max_workers
97+
self.max_workers = max_workers or ocr_config.get("max_workers", 20)
98+
99+
# Extract DPI
100+
self.dpi = ocr_config.get("dpi")
101+
102+
# Extract enhanced features
103+
features_config = ocr_config.get("features", [])
104+
if features_config:
105+
self.enhanced_features = [
106+
feature["name"] for feature in features_config
107+
]
108+
else:
109+
self.enhanced_features = False
110+
111+
# Extract image configuration
112+
image_config = ocr_config.get("image", {})
113+
114+
# Extract resize configuration
115+
target_width = image_config.get("target_width")
116+
target_height = image_config.get("target_height")
117+
if target_width is not None and target_height is not None:
118+
# Handle empty strings
119+
if isinstance(target_width, str) and not target_width.strip():
120+
target_width = None
121+
if isinstance(target_height, str) and not target_height.strip():
122+
target_height = None
123+
124+
if target_width is not None and target_height is not None:
125+
try:
126+
self.resize_config = {
127+
"target_width": int(target_width),
128+
"target_height": int(target_height),
129+
}
130+
except (ValueError, TypeError):
131+
logger.warning(
132+
f"Invalid resize configuration values: width={target_width}, height={target_height}"
133+
)
134+
self.resize_config = None
135+
else:
136+
self.resize_config = None
137+
else:
138+
self.resize_config = None
139+
140+
# Extract preprocessing configuration
141+
preprocessing_value = image_config.get("preprocessing")
142+
if preprocessing_value is True or (
143+
isinstance(preprocessing_value, str)
144+
and preprocessing_value.lower() == "true"
145+
):
146+
self.preprocessing_config = {"enabled": True}
147+
else:
148+
self.preprocessing_config = None
149+
150+
# Extract Bedrock configuration
151+
if self.backend == "bedrock":
152+
if all(
153+
key in ocr_config
154+
for key in ["model_id", "system_prompt", "task_prompt"]
155+
):
156+
self.bedrock_config = {
157+
"model_id": ocr_config["model_id"],
158+
"system_prompt": ocr_config["system_prompt"],
159+
"task_prompt": ocr_config["task_prompt"],
160+
}
161+
else:
162+
self.bedrock_config = None
163+
else:
164+
self.bedrock_config = None
73165

74166
# Log DPI setting for debugging
75167
logger.info(f"OCR Service initialized with DPI: {self.dpi}")
@@ -92,11 +184,11 @@ def __init__(
92184
VALID_FEATURES = ["TABLES", "FORMS", "SIGNATURES", "LAYOUT"]
93185

94186
# Validate features if provided as a list
95-
if isinstance(enhanced_features, list):
187+
if isinstance(self.enhanced_features, list):
96188
# Check for invalid features
97189
invalid_features = [
98190
feature
99-
for feature in enhanced_features
191+
for feature in self.enhanced_features
100192
if feature not in VALID_FEATURES
101193
]
102194
if invalid_features:
@@ -106,15 +198,13 @@ def __init__(
106198

107199
# Log the validated features
108200
logger.info(
109-
f"OCR Service initialized with features: {enhanced_features}"
201+
f"OCR Service initialized with features: {self.enhanced_features}"
110202
)
111203

112-
self.enhanced_features = enhanced_features
113-
114204
# Initialize Textract client with adaptive retries
115205
adaptive_config = Config(
116206
retries={"max_attempts": 100, "mode": "adaptive"},
117-
max_pool_connections=max_workers * 3,
207+
max_pool_connections=self.max_workers * 3,
118208
)
119209
self.textract_client = boto3.client(
120210
"textract", region_name=self.region, config=adaptive_config

notebooks/examples/step1_ocr.ipynb

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -110,22 +110,22 @@
110110
"metadata": {},
111111
"outputs": [],
112112
"source": [
113-
"# Extract OCR configuration\n",
114-
"ocr_config = CONFIG.get('ocr', {})\n",
115-
"print(\"OCR Configuration:\")\n",
116-
"print(json.dumps(ocr_config, indent=2))\n",
117-
"\n",
118-
"# Extract features from config\n",
119-
"features = [feature['name'] for feature in ocr_config.get('features', [])]\n",
120-
"print(f\"\\nOCR Features: {features}\")\n",
121-
"\n",
122-
"# Create OCR service with Textract\n",
113+
"# Create OCR service using new simplified pattern\n",
123114
"ocr_service = ocr.OcrService(\n",
124115
" region=env_info['region'],\n",
125-
" enhanced_features=features\n",
116+
" config=CONFIG # Pass entire config dictionary\n",
126117
")\n",
127118
"\n",
128-
"print(\"OCR service initialized\")"
119+
"print(\"OCR service initialized\")\n",
120+
"\n",
121+
"# Display configuration for debugging\n",
122+
"ocr_config = CONFIG.get('ocr', {})\n",
123+
"print(\"\\nOCR Configuration:\")\n",
124+
"print(json.dumps(ocr_config, indent=2))\n",
125+
"\n",
126+
"# Show backend being used\n",
127+
"backend = ocr_config.get('backend', 'textract')\n",
128+
"print(f\"\\nUsing backend: {backend}\")"
129129
]
130130
},
131131
{
@@ -233,7 +233,7 @@
233233
"print(f\"✅ Document processed: {document.id}\")\n",
234234
"print(f\"✅ Pages extracted: {document.num_pages}\")\n",
235235
"print(f\"✅ Processing time: {ocr_time:.2f} seconds\")\n",
236-
"print(f\"Features used: {', '.join(features)}\")\n",
236+
"print(f\"Backend used: {backend}\")\n",
237237
"print(f\"✅ Data saved to: .data/step1_ocr/\")\n",
238238
"print(\"\\n📌 Next step: Run step2_classification.ipynb\")"
239239
]

0 commit comments

Comments
 (0)