Commit 9255e4d

Author: Bob Strahan
Commit message: Merge branch 'develop' v0.3.1
2 parents: 64f8587 + 2634e0f

File tree: 17 files changed, +947 −894 lines changed


CHANGELOG.md

Lines changed: 17 additions & 0 deletions
@@ -5,6 +5,23 @@ SPDX-License-Identifier: MIT-0
 
 ## [Unreleased]
 
+## [0.3.1]
+
+### Added
+
+- **{DOCUMENT_IMAGE} Placeholder Support in Pattern-2**
+  - Added a new `{DOCUMENT_IMAGE}` placeholder for precise image positioning in classification and extraction prompts
+  - Enables strategic placement of document images within prompt templates for enhanced multimodal understanding
+  - Supports both single images and multi-page documents (up to 20 images, per Bedrock constraints)
+  - Fully backward compatible: existing prompts without the placeholder continue to work unchanged
+  - Integrates seamlessly with the existing `{FEW_SHOT_EXAMPLES}` functionality
+  - Added warning logging when image limits are exceeded, to help with debugging
+  - Enhanced documentation across classification.md, extraction.md, few-shot-examples.md, and pattern-2.md
+
+### Fixed
+- With the multimodal page-level classification method, excessive Bedrock throttling caused the service to return 'unclassified' instead of retrying.
+- Minor documentation issues.
+
 ## [0.3.0]
 
 ### Added

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-0.3.0
+0.3.1

config_library/pattern-2/default/config.yaml

Lines changed: 27 additions & 1 deletion
@@ -441,17 +441,25 @@ extraction:
 top_k: '5'
 task_prompt: >-
 <background>
+
 You are an expert in document analysis and information extraction.
 You can understand and extract key information from documents classified as type
+
 {DOCUMENT_CLASS}.
+
 </background>
 
+
 <task>
+
 Your task is to take the unstructured text provided and convert it into a well-organized table format using JSON. Identify the main entities, attributes, or categories mentioned in the attributes list below and use them as keys in the JSON object.
-Then, extract the relevant information from the text and populate the corresponding values in the JSON object.
+Then, extract the relevant information from the text and populate the corresponding values in the JSON object.
+
 </task>
 
+
 <extraction-guidelines>
+
 Guidelines:
 1. Ensure that the data is accurately represented and properly formatted within
 the JSON structure
@@ -474,19 +482,36 @@ extraction:
 - When a mark touches multiple options, analyze which option was most likely intended based on position and density. For handwritten checks, the mark typically flows from the selected checkbox outward.
 - Carefully analyze visual cues and contextual hints. Think from a human perspective, anticipate natural tendencies, and apply thoughtful reasoning to make the best possible judgment.
 10. Think step by step first and then answer.
+
 </extraction-guidelines>
 
+
 <attributes>
+
 {ATTRIBUTE_NAMES_AND_DESCRIPTIONS}
+
 </attributes>
 
+
 <<CACHEPOINT>>
 
+
 <document-text>
+
 {DOCUMENT_TEXT}
+
 </document-text>
+
+
+<document_image>
+
+{DOCUMENT_IMAGE}
+
+</document_image>
+
 
 <final-instructions>
+
 Extract key information from the document and return a JSON object with the following key steps:
 1. Carefully analyze the document text to identify the requested attributes
 2. Extract only information explicitly found in the document - never make up data
@@ -495,6 +520,7 @@ extraction:
 5. Use null for any fields not found in the document
 6. Ensure the output is properly formatted JSON with quoted keys and values
 7. Think step by step before finalizing your answer
+
 </final-instructions>
 temperature: '0.0'
 model: us.amazon.nova-pro-v1:0
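The `<<CACHEPOINT>>` marker in the task_prompt above separates the static, cacheable prefix (instructions and attribute definitions) from the per-document suffix. A minimal sketch of how such a template might be rendered into Bedrock Converse-style content blocks — `render_prompt` is a hypothetical helper, not the solution's actual code:

```python
def render_prompt(template: str, **values: str) -> list:
    """Fill {PLACEHOLDER} slots, then split on <<CACHEPOINT>> so the
    static prefix can be marked as a cache boundary for prompt caching."""
    for key, val in values.items():
        template = template.replace("{" + key + "}", val)
    segments = template.split("<<CACHEPOINT>>")
    blocks = []
    for i, seg in enumerate(segments):
        blocks.append({"text": seg})
        if i < len(segments) - 1:
            # Converse-API-style cache point between static and dynamic parts
            blocks.append({"cachePoint": {"type": "default"}})
    return blocks

blocks = render_prompt(
    "<background>Documents of type {DOCUMENT_CLASS}.</background>"
    "<<CACHEPOINT>>"
    "<document-text>{DOCUMENT_TEXT}</document-text>",
    DOCUMENT_CLASS="bank-statement",
    DOCUMENT_TEXT="Account Number: 1234",
)
```

Everything before the cache point is identical across documents of the same configuration, so only the suffix changes per request.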

config_library/pattern-2/few_shot_example_with_multimodal_page_classification/config.yaml

Lines changed: 84 additions & 44 deletions
@@ -612,6 +612,21 @@ classes:
 description: >-
 A bank statement document containing account information, transactions,
 and financial details
+attributes:
+- name: account_holder_name
+  description: >-
+    The name of the account holder.
+- name: account_name
+  description: >-
+    The name or type of the bank account.
+- name: account_number
+  description: >-
+    The unique identifier for the bank account. Look for text following
+    'account number', 'account id', or 'account identifier'.
+- name: transactions
+  description: >-
+    The list of transactions on the account. Look for text following
+    'transactions', 'transaction history', or 'transaction details'.
 examples:
 - classPrompt: Here are example images for each page of a 3 page 'bank-statement '
   name: BankStatement1
@@ -657,22 +672,32 @@ classification:
 {CLASS_NAMES_AND_DESCRIPTIONS}
 
 
-Respond only with a JSON object containing the class label. For example:
-{{"class": "letter"}}
-
-<few_shot_examples>
+<few_shot_examples>
 
 {FEW_SHOT_EXAMPLES}
 
 </few_shot_examples>
 
+
 <<CACHEPOINT>>
 
-<document_ocr_data>
-
-{DOCUMENT_TEXT}
+
+<document_ocr_data>
+
+{DOCUMENT_TEXT}
 
 </document_ocr_data>
+
+
+<document_image>
+
+{DOCUMENT_IMAGE}
+
+</document_image>
+
+
+Respond only with a JSON object containing the class label. For example:
+{{"class": "letter"}}
 extraction:
 model: us.amazon.nova-pro-v1:0
 temperature: '0.0'
@@ -685,71 +710,86 @@ extraction:
 task_prompt: >
 <background>
 
-You are an expert in business document analysis and information extraction.
+You are an expert in document analysis and information extraction.
+You can understand and extract key information from documents classified as type
 
-You can understand and extract key information from business documents.
+{DOCUMENT_CLASS}.
 
-<task>
+</background>
 
-Your task is to take the unstructured text provided and convert it into a
-well-organized table format using JSON. Identify the main entities,
-attributes, or categories mentioned in the attributes list below and use
-them as keys in the JSON object.
-Then, extract the relevant information from the text and populate the
-corresponding values in the JSON object.
+<task>
+
+Your task is to take the unstructured text provided and convert it into a well-organized table format using JSON. Identify the main entities, attributes, or categories mentioned in the attributes list below and use them as keys in the JSON object.
+Then, extract the relevant information from the text and populate the corresponding values in the JSON object.
+
+</task>
+
+<extraction-guidelines>
 
 Guidelines:
+1. Ensure that the data is accurately represented and properly formatted within
+the JSON structure
+2. Include double quotes around all keys and values
+3. Do not make up data - only extract information explicitly found in the
+document
+4. Do not use /n for new lines, use a space instead
+5. If a field is not found or if unsure, return null
+6. All dates should be in MM/DD/YYYY format
+7. Do not perform calculations or summations unless totals are explicitly given
+8. If an alias is not found in the document, return null
+9. Guidelines for checkboxes:
+9.A. CAREFULLY examine each checkbox, radio button, and selection field:
+- Look for marks like ✓, ✗, x, filled circles (●), darkened areas, or handwritten checks indicating selection
+- For checkboxes and multi-select fields, ONLY INCLUDE options that show clear visual evidence of selection
+- DO NOT list options that have no visible selection mark
+9.B. For ambiguous or overlapping tick marks:
+- If a mark overlaps between two or more checkboxes, determine which option contains the majority of the mark
+- Consider a checkbox selected if the mark is primarily inside the check box or over the option text
+- When a mark touches multiple options, analyze which option was most likely intended based on position and density. For handwritten checks, the mark typically flows from the selected checkbox outward.
+- Carefully analyze visual cues and contextual hints. Think from a human perspective, anticipate natural tendencies, and apply thoughtful reasoning to make the best possible judgment.
+10. Think step by step first and then answer.
+
+</extraction-guidelines>
 
-Ensure that the data is accurately represented and properly formatted within
-the JSON structure
-
-Include double quotes around all keys and values
-
-Do not make up data - only extract information explicitly found in the
-document
-
-Do not use /n for new lines, use a space instead
-
-If a field is not found or if unsure, return null
-
-All dates should be in MM/DD/YYYY format
-
-Do not perform calculations or summations unless totals are explicitly given
-
-If an alias is not found in the document, return null
-
-Here are the attributes you should extract:
 
 <attributes>
 
 {ATTRIBUTE_NAMES_AND_DESCRIPTIONS}
 
 </attributes>
 
-<few_shot_examples>
-
-{FEW_SHOT_EXAMPLES}
-
-</few_shot_examples>
-
-</task>
-
-</background>
-
-<<CACHEPOINT>>
-
-The document tpe is {DOCUMENT_CLASS}. Here is the document content:
-
-<document_ocr_data>
-
-{DOCUMENT_TEXT}
-
-</document_ocr_data>
+<<CACHEPOINT>>
+
+<document-text>
+
+{DOCUMENT_TEXT}
+
+</document-text>
+
+<document_image>
+
+{DOCUMENT_IMAGE}
+
+</document_image>
+
+
+<final-instructions>
+
+Extract key information from the document and return a JSON object with the following key steps:
+1. Carefully analyze the document text to identify the requested attributes
+2. Extract only information explicitly found in the document - never make up data
+3. Format all dates as MM/DD/YYYY and replace newlines with spaces
+4. For checkboxes, only include options with clear visual selection marks
+5. Use null for any fields not found in the document
+6. Ensure the output is properly formatted JSON with quoted keys and values
+7. Think step by step before finalizing your answer
+
+</final-instructions>
 pricing:
 - name: textract/detect_document_text
   units:
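The classification prompt above interleaves per-class `examples` entries (a `classPrompt` plus example page images) into the `{FEW_SHOT_EXAMPLES}` slot. A rough sketch of how that assembly might look — the function name and the `imagePages` key are hypothetical illustrations, not the solution's actual schema:

```python
def build_few_shot_blocks(classes: list) -> list:
    """Interleave each example's classPrompt text with its page images,
    producing content blocks to substitute for {FEW_SHOT_EXAMPLES}."""
    blocks = []
    for cls in classes:
        for example in cls.get("examples", []):
            blocks.append({"text": example["classPrompt"]})
            # hypothetical key holding the example's page image bytes
            for image_bytes in example.get("imagePages", []):
                blocks.append(
                    {"image": {"format": "jpeg", "source": {"bytes": image_bytes}}}
                )
    return blocks

classes = [{
    "name": "bank-statement",
    "examples": [{
        "classPrompt": "Here are example images for each page of a 3 page 'bank-statement'",
        "name": "BankStatement1",
        "imagePages": [b"page1", b"page2", b"page3"],
    }],
}]
blocks = build_few_shot_blocks(classes)
```

Because the few-shot block sits before `<<CACHEPOINT>>` in the template, these example images land in the cacheable prefix and are reused across classification requests.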

docs/classification.md

Lines changed: 98 additions & 1 deletion
@@ -11,7 +11,7 @@ The solution supports multiple classification approaches that vary by pattern:
 
 ### Pattern 1: BDA-Based Classification
 
-- Classification is performed by the BDA (Business Document Analysis) project configuration
+- Classification is performed by the BDA (Bedrock Data Automation) project configuration
 - Uses BDA blueprints to define classification rules
 - Not configurable inside the GenAIIDP solution itself
 - Configuration happens at the BDA project level
@@ -197,6 +197,103 @@ You can define custom document classes through the Web UI configuration:
 - Detailed description (to guide the classification model)
 5. Save changes
 
+## Image Placement with {DOCUMENT_IMAGE} Placeholder
+
+Pattern 2 supports precise control over where document images are positioned within your classification prompts using the `{DOCUMENT_IMAGE}` placeholder. This feature allows you to specify exactly where images should appear in your prompt template, rather than having them automatically appended at the end.
+
+### How {DOCUMENT_IMAGE} Works
+
+**Without Placeholder (Default Behavior):**
+```yaml
+classification:
+  task_prompt: |
+    Analyze this document:
+
+    {DOCUMENT_TEXT}
+
+    Classify it as one of: {CLASS_NAMES_AND_DESCRIPTIONS}
+```
+Images are automatically appended after the text content.
+
+**With Placeholder (Controlled Placement):**
+```yaml
+classification:
+  task_prompt: |
+    Analyze this document:
+
+    {DOCUMENT_IMAGE}
+
+    Text content: {DOCUMENT_TEXT}
+
+    Classify it as one of: {CLASS_NAMES_AND_DESCRIPTIONS}
+```
+Images are inserted exactly where `{DOCUMENT_IMAGE}` appears in the prompt.
+
+### Usage Examples
+
+**Image Before Text Analysis:**
+```yaml
+task_prompt: |
+  Look at this document image first:
+
+  {DOCUMENT_IMAGE}
+
+  Now read the extracted text:
+  {DOCUMENT_TEXT}
+
+  Based on both the visual layout and text content, classify this document as one of:
+  {CLASS_NAMES_AND_DESCRIPTIONS}
+```
+
+**Image in the Middle for Context:**
+```yaml
+task_prompt: |
+  You are classifying business documents. Here are the possible types:
+  {CLASS_NAMES_AND_DESCRIPTIONS}
+
+  Examine this document image:
+  {DOCUMENT_IMAGE}
+
+  Additional text content extracted from the document:
+  {DOCUMENT_TEXT}
+
+  Classification:
+```
+
+### Integration with Few-Shot Examples
+
+The `{DOCUMENT_IMAGE}` placeholder works seamlessly with few-shot examples:
+
+```yaml
+classification:
+  task_prompt: |
+    Here are examples of each document type:
+    {FEW_SHOT_EXAMPLES}
+
+    Now classify this new document:
+    {DOCUMENT_IMAGE}
+
+    Text: {DOCUMENT_TEXT}
+
+    Classification: {CLASS_NAMES_AND_DESCRIPTIONS}
+```
+
+### Benefits
+
+- **🎯 Contextual Placement**: Position images where they provide maximum context
+- **📱 Better Multimodal Understanding**: Help models correlate visual and textual information
+- **🔄 Flexible Prompt Design**: Create prompts that flow naturally between different content types
+- **⚡ Improved Performance**: Strategic image placement can improve classification accuracy
+- **🔒 Backward Compatible**: Existing prompts without the placeholder continue to work unchanged
+
+### Multi-Page Documents
+
+For documents with multiple pages, the system automatically handles image limits:
+
+- **Bedrock Limit**: Maximum 20 images per request (automatically enforced)
+- **Warning Logging**: System logs warnings when images are truncated due to limits
+- **Smart Handling**: Images are processed in page order, with excess images automatically dropped
+
 ## Setting Up Few Shot Examples in Pattern 2
 
 Pattern 2's multimodal page-level classification supports few-shot example prompting, which can significantly improve classification accuracy by providing concrete document examples. This feature is available when you select the 'few_shot_example_with_multimodal_page_classification' configuration.
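The placeholder behavior documented above — insert images at `{DOCUMENT_IMAGE}` if present, otherwise append them, and enforce the 20-image cap with a warning — can be sketched as follows. This is a minimal illustration with hypothetical names (`build_content`, `MAX_BEDROCK_IMAGES`), not the solution's actual implementation:

```python
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

# Cap mirroring the documented Bedrock limit of 20 images per request.
MAX_BEDROCK_IMAGES = 20

def build_content(task_prompt: str, images: list) -> list:
    """Split the prompt at {DOCUMENT_IMAGE} and insert image blocks there.

    If the placeholder is absent, images are appended after the text,
    matching the documented default behavior.
    """
    if len(images) > MAX_BEDROCK_IMAGES:
        # Warn so truncation is visible in logs, then keep pages in order
        # and drop the excess.
        logger.warning(
            "Truncating %d images to the %d-image Bedrock limit",
            len(images), MAX_BEDROCK_IMAGES,
        )
        images = images[:MAX_BEDROCK_IMAGES]

    image_blocks = [
        {"image": {"format": "jpeg", "source": {"bytes": b}}} for b in images
    ]

    if "{DOCUMENT_IMAGE}" in task_prompt:
        before, after = task_prompt.split("{DOCUMENT_IMAGE}", 1)
        return [{"text": before}] + image_blocks + [{"text": after}]
    return [{"text": task_prompt}] + image_blocks

# Example: placeholder-controlled placement with a two-page document
prompt = "Analyze this document:\n\n{DOCUMENT_IMAGE}\n\nText content: (OCR text here)"
content = build_content(prompt, [b"img1", b"img2"])
```

With the placeholder present, the two image blocks land between the two text blocks; without it, they would simply follow the full prompt text.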
