aws-solutions-library-samples
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 8 additions & 0 deletions
diff --git a/lib/idp_common_pkg/idp_common/ocr/README.md b/lib/idp_common_pkg/idp_common/ocr/README.md
Lines changed: 91 additions & 7 deletions
diff --git a/lib/idp_common_pkg/idp_common/ocr/service.py b/lib/idp_common_pkg/idp_common/ocr/service.py
Lines changed: 121 additions & 31 deletions
diff --git a/notebooks/examples/step1_ocr.ipynb b/notebooks/examples/step1_ocr.ipynb
Lines changed: 13 additions & 13 deletions
@@ -14,6 +14,14 @@ SPDX-License-Identifier: MIT-0
   - Fixed view toggle behavior - switching between views no longer closes the viewer window
   - Reordered view buttons to: Markdown View, Text Confidence View, Text View for better user experience
 
+- **Simplified OCR Service Initialization**
+  - OCR service now accepts a single `config` dictionary parameter for a cleaner, more consistent API
+  - Aligned with the classification service pattern for better consistency across IDP services
+  - Automatic extraction of all OCR settings from configuration dictionary
+  - Backward compatibility maintained - the old parameter pattern is still supported when `config` is not passed, and logs a deprecation warning
+  - Updated all lambda functions and notebooks to use new simplified pattern
+  - Comprehensive migration guide added to OCR README
+
 ### Changed
 - **Converted text confidence data format from JSON to markdown table for improved readability and reduced token usage**
   - Removed unnecessary "page_count" field
 
@@ -51,10 +51,15 @@ The service supports three OCR backends, each with different capabilities and us
 
 ## Usage Example
 
+### New Simplified Pattern (Recommended)
+
 ```python
-from idp_common import ocr
+from idp_common import ocr, get_config
 from idp_common.models import Document
 
+# Load configuration (typically from DynamoDB)
+config = get_config()
+
 # Create or retrieve a Document object with input/output details
 document = Document(
     id="doc-123",
@@ -63,14 +68,11 @@ document = Document(
     output_bucket="output-bucket"
 )
 
-# Initialize OCR service
+# Initialize OCR service with config dictionary
 ocr_service = ocr.OcrService(
     region='us-east-1',
-    max_workers=20,
-    enhanced_features=False  # Default: basic text detection (faster)
-    # enhanced_features=["TABLES", "FORMS"]  # For table and form recognition
-    # enhanced_features=["LAYOUT"]  # For layout analysis
-    # enhanced_features=["TABLES", "FORMS", "SIGNATURES"]  # Multiple features
+    config=config,  # Pass entire config dictionary
+    backend='textract'  # Optional: override backend from config
 )
 
 # Process document - this will automatically get the PDF from S3
@@ -84,6 +86,88 @@ for page_id, page in processed_document.pages.items():
     print(f"Page {page_id}: Text confidence data at {page.text_confidence_uri}")
 ```
 
+### Legacy Pattern (Deprecated)
+
+```python
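+# The old pattern with individual parameters is still supported but deprecated.
+# It only takes effect when `config` is not passed; if `config` is given,
+# enhanced_features, dpi and resize_config are ignored in favor of the config values.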
+ocr_service = ocr.OcrService(
+    region='us-east-1',
+    max_workers=20,
+    enhanced_features=False,  # or ["TABLES", "FORMS"]
+    dpi=150,
+    resize_config={"target_width": 1024, "target_height": 1024},
+    backend='textract'
+)
+```
+
+## Configuration Structure
+
+When using the new pattern, the OCR service expects configuration in the following structure:
+
+```yaml
+ocr:
+  backend: "textract"  # Options: "textract", "bedrock", "none"
+  max_workers: 20
+  dpi: 150
+  features:
+    - name: "TABLES"
+    - name: "FORMS"
+  image:
+    target_width: 1024
+    target_height: 1024
+    preprocessing: false  # Set to true to enable adaptive binarization
+  # For Bedrock backend only (all three keys must be present, otherwise no Bedrock config is set):
+  model_id: "anthropic.claude-3-sonnet-20240229-v1:0"
+  system_prompt: "You are an OCR system..."
+  task_prompt: "Extract all text from this image..."
+```
+
+## Migration Guide
+
+To migrate from the old pattern to the new pattern:
+
+1. **In Lambda functions:**
+   ```python
+   # Old pattern
+   features = [feature['name'] for feature in ocr_config.get("features", [])]
+   service = ocr.OcrService(
+       region=region,
+       max_workers=MAX_WORKERS,
+       enhanced_features=features,
+       resize_config=resize_config,
+       backend=backend
+   )
+   
+   # New pattern
+   config = get_config()
+   service = ocr.OcrService(
+       region=region,
+       config=config,
+       backend=config.get("ocr", {}).get("backend", "textract")
+   )
+   ```
+
+2. **In notebooks:**
+   ```python
+   # Old pattern
+   ocr_service = ocr.OcrService(
+       region=region,
+       enhanced_features=features
+   )
+   
+   # New pattern
+   ocr_service = ocr.OcrService(
+       region=region,
+       config=CONFIG  # Where CONFIG is your loaded configuration
+   )
+   ```
+
+The new pattern provides:
+- Cleaner, more consistent API across all IDP services
+- Easier configuration management
+- No need to extract individual parameters (`backend` may still be passed explicitly, but only to override `ocr.backend` from the config)
+- Future-proof design for adding new features
+
 ## Text Confidence Data
 
 The OCR service automatically generates optimized text confidence data for each page, which is specifically designed for LLM assessment prompts. This feature dramatically reduces token usage while preserving all information needed for confidence evaluation.
 
@@ -32,44 +32,136 @@ class OcrService:
     def __init__(
         self,
         region: Optional[str] = None,
-        max_workers: int = 20,
-        enhanced_features: Union[bool, List[str]] = False,
+        config: Optional[Dict[str, Any]] = None,
+        backend: Optional[str] = None,
+        max_workers: Optional[int] = None,
+        # Deprecated parameters for backward compatibility
+        enhanced_features: Optional[Union[bool, List[str]]] = None,
         dpi: Optional[int] = None,
         resize_config: Optional[Dict[str, Any]] = None,
-        bedrock_config: Dict[str, Any] = None,
-        backend: str = "textract",  # New parameter: "textract" or "bedrock"
-        preprocessing_config: Optional[
-            Dict[str, Any]
-        ] = None,  # New parameter for preprocessing
+        bedrock_config: Optional[Dict[str, Any]] = None,
+        preprocessing_config: Optional[Dict[str, Any]] = None,
     ):
         """
         Initialize the OCR service.
 
         Args:
             region: AWS region for services
+            config: Configuration dictionary containing all OCR settings
+            backend: OCR backend to use ("textract", "bedrock", or "none")
             max_workers: Maximum number of concurrent workers for page processing
-            enhanced_features: Controls Textract FeatureTypes for analyze_document API:
-                           - If False: Uses basic detect_document_text (faster, no features)
-                           - If List[str]: Uses analyze_document with specified features
-                              Valid features: TABLES, FORMS, SIGNATURES, LAYOUT
+
+            Deprecated parameters (use config instead):
+            enhanced_features: Controls Textract FeatureTypes for analyze_document API
             dpi: DPI (dots per inch) for image generation from PDF pages
-            resize_config: Optional dictionary containing image resizing configuration
-                          with 'target_width' and 'target_height' keys
-            backend: OCR backend to use ("textract" or "bedrock")
-            bedrock_config: Optional dictionary containing bedrock configuration if backend is "bedrock"
-            config: Configuration dictionary
+            resize_config: Image resizing configuration
+            bedrock_config: Bedrock configuration if backend is "bedrock"
+            preprocessing_config: Preprocessing configuration
 
         Raises:
-            ValueError: If invalid features are specified in enhanced_features
-                       or if an invalid backend is specified
+            ValueError: If invalid features are specified or if an invalid backend is specified
         """
-        self.region = region or os.environ.get("AWS_REGION", "us-east-1")
-        self.max_workers = max_workers
-        self.dpi = dpi
-        self.resize_config = resize_config
-        self.backend = backend.lower()
-        self.bedrock_config = bedrock_config
-        self.preprocessing_config = preprocessing_config
+        # Handle backward compatibility
+        if config is None and any(
+            [
+                enhanced_features is not None,
+                dpi is not None,
+                resize_config is not None,
+                bedrock_config is not None,
+                preprocessing_config is not None,
+            ]
+        ):
+            logger.warning(
+                "Using deprecated parameter pattern. Please migrate to using 'config' parameter. "
+                "See OCR README for migration guide."
+            )
+            # Use old parameters
+            self.region = region or os.environ.get("AWS_REGION", "us-east-1")
+            self.max_workers = max_workers or 20
+            self.dpi = dpi
+            self.resize_config = resize_config
+            self.backend = (backend or "textract").lower()
+            self.bedrock_config = bedrock_config
+            self.preprocessing_config = preprocessing_config
+            self.enhanced_features = enhanced_features
+        else:
+            # New pattern - extract from config
+            self.region = region or os.environ.get("AWS_REGION", "us-east-1")
+            self.config = config or {}
+            ocr_config = self.config.get("ocr", {})
+
+            # Extract backend
+            self.backend = (backend or ocr_config.get("backend", "textract")).lower()
+
+            # Extract max_workers
+            self.max_workers = max_workers or ocr_config.get("max_workers", 20)
+
+            # Extract DPI
+            self.dpi = ocr_config.get("dpi")
+
+            # Extract enhanced features
+            features_config = ocr_config.get("features", [])
+            if features_config:
+                self.enhanced_features = [
+                    feature["name"] for feature in features_config
+                ]
+            else:
+                self.enhanced_features = False
+
+            # Extract image configuration
+            image_config = ocr_config.get("image", {})
+
+            # Extract resize configuration
+            target_width = image_config.get("target_width")
+            target_height = image_config.get("target_height")
+            if target_width is not None and target_height is not None:
+                # Handle empty strings
+                if isinstance(target_width, str) and not target_width.strip():
+                    target_width = None
+                if isinstance(target_height, str) and not target_height.strip():
+                    target_height = None
+
+                if target_width is not None and target_height is not None:
+                    try:
+                        self.resize_config = {
+                            "target_width": int(target_width),
+                            "target_height": int(target_height),
+                        }
+                    except (ValueError, TypeError):
+                        logger.warning(
+                            f"Invalid resize configuration values: width={target_width}, height={target_height}"
+                        )
+                        self.resize_config = None
+                else:
+                    self.resize_config = None
+            else:
+                self.resize_config = None
+
+            # Extract preprocessing configuration
+            preprocessing_value = image_config.get("preprocessing")
+            if preprocessing_value is True or (
+                isinstance(preprocessing_value, str)
+                and preprocessing_value.lower() == "true"
+            ):
+                self.preprocessing_config = {"enabled": True}
+            else:
+                self.preprocessing_config = None
+
+            # Extract Bedrock configuration
+            if self.backend == "bedrock":
+                if all(
+                    key in ocr_config
+                    for key in ["model_id", "system_prompt", "task_prompt"]
+                ):
+                    self.bedrock_config = {
+                        "model_id": ocr_config["model_id"],
+                        "system_prompt": ocr_config["system_prompt"],
+                        "task_prompt": ocr_config["task_prompt"],
+                    }
+                else:
+                    self.bedrock_config = None
+            else:
+                self.bedrock_config = None
 
         # Log DPI setting for debugging
         logger.info(f"OCR Service initialized with DPI: {self.dpi}")
@@ -92,11 +184,11 @@ def __init__(
             VALID_FEATURES = ["TABLES", "FORMS", "SIGNATURES", "LAYOUT"]
 
             # Validate features if provided as a list
-            if isinstance(enhanced_features, list):
+            if isinstance(self.enhanced_features, list):
                 # Check for invalid features
                 invalid_features = [
                     feature
-                    for feature in enhanced_features
+                    for feature in self.enhanced_features
                     if feature not in VALID_FEATURES
                 ]
                 if invalid_features:
@@ -106,15 +198,13 @@ def __init__(
 
                 # Log the validated features
                 logger.info(
-                    f"OCR Service initialized with features: {enhanced_features}"
+                    f"OCR Service initialized with features: {self.enhanced_features}"
                 )
 
-            self.enhanced_features = enhanced_features
-
             # Initialize Textract client with adaptive retries
             adaptive_config = Config(
                 retries={"max_attempts": 100, "mode": "adaptive"},
-                max_pool_connections=max_workers * 3,
+                max_pool_connections=self.max_workers * 3,
             )
             self.textract_client = boto3.client(
                 "textract", region_name=self.region, config=adaptive_config
 
@@ -110,22 +110,22 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Extract OCR configuration\n",
-    "ocr_config = CONFIG.get('ocr', {})\n",
-    "print(\"OCR Configuration:\")\n",
-    "print(json.dumps(ocr_config, indent=2))\n",
-    "\n",
-    "# Extract features from config\n",
-    "features = [feature['name'] for feature in ocr_config.get('features', [])]\n",
-    "print(f\"\\nOCR Features: {features}\")\n",
-    "\n",
-    "# Create OCR service with Textract\n",
+    "# Create OCR service using new simplified pattern\n",
     "ocr_service = ocr.OcrService(\n",
     "    region=env_info['region'],\n",
-    "    enhanced_features=features\n",
+    "    config=CONFIG  # Pass entire config dictionary\n",
     ")\n",
     "\n",
-    "print(\"OCR service initialized\")"
+    "print(\"OCR service initialized\")\n",
+    "\n",
+    "# Display configuration for debugging\n",
+    "ocr_config = CONFIG.get('ocr', {})\n",
+    "print(\"\\nOCR Configuration:\")\n",
+    "print(json.dumps(ocr_config, indent=2))\n",
+    "\n",
+    "# Show backend being used\n",
+    "backend = ocr_config.get('backend', 'textract')\n",
+    "print(f\"\\nUsing backend: {backend}\")"
    ]
   },
   {
@@ -233,7 +233,7 @@
     "print(f\"✅ Document processed: {document.id}\")\n",
     "print(f\"✅ Pages extracted: {document.num_pages}\")\n",
     "print(f\"✅ Processing time: {ocr_time:.2f} seconds\")\n",
-    "print(f\"✅ Features used: {', '.join(features)}\")\n",
+    "print(f\"✅ Backend used: {backend}\")\n",
     "print(f\"✅ Data saved to: .data/step1_ocr/\")\n",
     "print(\"\\n📌 Next step: Run step2_classification.ipynb\")"
    ]