
Commit 4aaace4

Author: Bob Strahan (committed)
Merge branch 'develop' into feature/customfinetuned-model
2 parents 4b9dd7e + 51a0a6d, commit 4aaace4

72 files changed: 13,441 additions, 172 deletions


CHANGELOG.md

Lines changed: 32 additions & 0 deletions
@@ -5,6 +5,38 @@ SPDX-License-Identifier: MIT-0
 
 ## [Unreleased]
 
+## [0.3.3]
+
+### Added
+
+- **Assessment Feature for Extraction Confidence Evaluation (EXPERIMENTAL)**
+  - Added a new assessment service that uses LLMs to evaluate extraction confidence by analyzing extraction results against the source document
+  - Multi-modal assessment capability combining text analysis with document images for comprehensive confidence scoring
+  - UI integration with an explainability_info display showing per-attribute confidence scores, thresholds, and explanations
+  - Optional deployment controlled by the `IsAssessmentEnabled` parameter (defaults to false)
+  - Added the e2e-example-with-assessment.ipynb notebook for testing the assessment workflow
+
+- **Enhanced Evaluation Framework with Confidence Integration**
+  - Added expected_confidence and actual_confidence fields to evaluation reports for quality analysis
+  - Automatic extraction and display of confidence scores from assessment explainability_info
+  - Enhanced JSON and Markdown evaluation reports with confidence columns
+  - Backward-compatible integration: shows "N/A" when confidence data is unavailable
+
+- **Evaluation Analytics Database and Reporting System**
+  - Added a comprehensive ReportingDatabase (AWS Glue) with structured storage of evaluation metrics
+  - Three-tier analytics tables: document_evaluations, section_evaluations, and attribute_evaluations
+  - Automatic partitioning by date and document for efficient querying with Amazon Athena
+  - Detailed metrics tracking, including accuracy, precision, recall, F1 score, execution time, and evaluation method
+  - Added the evaluation_reporting_analytics.ipynb notebook for performance analysis and visualization
+  - Multi-level analytics with document-, section-, and attribute-level insights
+  - Visual dashboards showing accuracy distributions, performance trends, and problematic patterns
+  - Configurable filters for date ranges, document types, and evaluation thresholds
+  - Integration with the existing evaluation framework: metrics are saved to the database automatically
+  - ReportingDatabase output added to the CloudFormation template for easy reference
+
+### Fixed
+- Fixed a build failure caused by pandas and numpy dependency conflicts in the idp_common_pkg package
+
 ## [0.3.2]
 
 ### Added
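The three analytics tables above are designed for querying with Amazon Athena. As a hedged sketch, not tooling shipped in this commit, the snippet below starts an aggregate query with boto3; the table name comes from the changelog, while the Glue database name, the column names, and the S3 output location are placeholder assumptions to replace with your stack's values (the database name is surfaced as the new ReportingDatabase CloudFormation output).

```python
# Hedged sketch: aggregate attribute-level evaluation metrics with Athena.
# Assumed, not taken from this commit: the database name "idp_reporting",
# the column names, and the S3 results location.
import boto3

athena = boto3.client("athena")

QUERY = """
SELECT document_class,
       AVG(accuracy) AS avg_accuracy,
       AVG(f1_score) AS avg_f1
FROM attribute_evaluations
GROUP BY document_class
ORDER BY avg_f1 ASC
"""

response = athena.start_query_execution(
    QueryString=QUERY,
    QueryExecutionContext={"Database": "idp_reporting"},  # placeholder name
    ResultConfiguration={"OutputLocation": "s3://example-bucket/athena-results/"},
)
print("QueryExecutionId:", response["QueryExecutionId"])
```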

README.md

Lines changed: 4 additions & 2 deletions
@@ -35,6 +35,7 @@ A scalable, serverless solution for automated document processing and informatio
 - **Comprehensive Monitoring**: Rich CloudWatch dashboard with detailed metrics and logs
 - **Web User Interface**: Modern UI for inspecting document workflow status and results
 - **AI-Powered Evaluation**: Framework to assess accuracy against baseline data
+- **Extraction Confidence Assessment**: LLM-powered assessment of extraction confidence with multimodal document analysis
 - **Document Knowledge Base Query**: Ask questions about your processed documents
 
 ## Architecture Overview
@@ -110,11 +111,12 @@ For detailed deployment and testing instructions, see the [Deployment Guide](./d
 - [Architecture](./docs/architecture.md) - Detailed component architecture and data flow
 - [Deployment](./docs/deployment.md) - Build, publish, deploy, and test instructions
 - [Web UI](./docs/web-ui.md) - Web interface features and usage
-- [Knowledge Base](./docs/knowledge-base.md) - Document knowledge base query feature
-- [Evaluation Framework](./docs/evaluation.md) - Accuracy assessment system
 - [Configuration](./docs/configuration.md) - Configuration and customization options
 - [Classification](./docs/classification.md) - Customizing document classification
 - [Extraction](./docs/extraction.md) - Customizing information extraction
+- [Assessment](./docs/assessment.md) - Extraction confidence evaluation using LLMs
+- [Evaluation Framework](./docs/evaluation.md) - Accuracy assessment system with analytics database and reporting
+- [Knowledge Base](./docs/knowledge-base.md) - Document knowledge base query feature
 - [Monitoring](./docs/monitoring.md) - Monitoring and logging capabilities
 - [Troubleshooting](./docs/troubleshooting.md) - Troubleshooting and performance guides

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-0.3.3-alpha
+0.3.3-beta

config_library/pattern-2/default/config.yaml

Lines changed: 108 additions & 1 deletion
@@ -13,10 +13,13 @@ classes:
     attributes:
       - name: sender_name
         description: The name of the person or entity who wrote or sent the letter. Look for text following or near terms like 'from', 'sender', 'authored by', 'written by', or at the end of the letter before a signature.
+        confidence_threshold: '0.85'
       - name: sender_address
         description: The physical address of the sender, typically appearing at the top of the letter. May be labeled as 'address', 'location', or 'from address'.
+        confidence_threshold: '0.8'
       - name: recipient_name
         description: The name of the person or entity receiving the letter. Look for this after 'to', 'recipient', 'addressee', or at the beginning of the letter.
+        confidence_threshold: '0.9'
       - name: recipient_address
         description: The physical address where the letter is to be delivered. Often labeled as 'to address' or 'delivery address', typically appearing below the recipient name.
       - name: date
@@ -587,6 +590,110 @@ summarization:
   model: us.anthropic.claude-3-7-sonnet-20250219-v1:0
   system_prompt: >-
     You are a document summarization expert who can analyze and summarize documents from various domains including medical, financial, legal, and general business documents. Your task is to create a summary that captures the key information, main points, and important details from the document. Your output must be in valid JSON format. \nSummarization Style: Balanced\\nCreate a balanced summary that provides a moderate level of detail. Include the main points and key supporting information, while maintaining the document's overall structure. Aim for a comprehensive yet concise summary.\n Your output MUST be in valid JSON format with markdown content. You MUST strictly adhere to the output format specified in the instructions.
+assessment:
+  default_confidence_threshold: '0.9'
+  top_p: '0.1'
+  max_tokens: '4096'
+  top_k: '5'
+  task_prompt: >-
+    <background>
+
+    You are an expert document analysis assessment system. Your task is to evaluate the confidence and accuracy of extraction results for a document of class {DOCUMENT_CLASS}.
+
+    </background>
+
+
+    <task>
+
+    Analyze the extraction results against the source document and provide confidence assessments for each extracted attribute. Consider factors such as:
+
+    1. Text clarity and OCR quality in the source regions
+    2. Alignment between extracted values and document content
+    3. Presence of clear evidence supporting the extraction
+    4. Potential ambiguity or uncertainty in the source material
+    5. Completeness and accuracy of the extracted information
+
+    </task>
+
+
+    <assessment-guidelines>
+
+    For each attribute, provide:
+    1. A confidence score between 0.0 and 1.0 where:
+       - 1.0 = Very high confidence, clear and unambiguous evidence
+       - 0.8-0.9 = High confidence, strong evidence with minor uncertainty
+       - 0.6-0.7 = Medium confidence, reasonable evidence but some ambiguity
+       - 0.4-0.5 = Low confidence, weak or unclear evidence
+       - 0.0-0.3 = Very low confidence, little to no supporting evidence
+
+    2. A clear reason explaining the confidence score, including:
+       - What evidence supports or contradicts the extraction
+       - Any OCR quality issues that affect confidence
+       - Clarity of the source document in relevant areas
+       - Any ambiguity or uncertainty factors
+
+    Guidelines:
+    - Base assessments on actual document content and OCR quality
+    - Consider both text-based evidence and visual/layout clues
+    - Account for OCR confidence scores when provided
+    - Be objective and specific in reasoning
+    - If an extraction appears incorrect, score accordingly with explanation
+
+    </assessment-guidelines>
+
+    <attributes-definitions>
+
+    {ATTRIBUTE_NAMES_AND_DESCRIPTIONS}
+
+    </attributes-definitions>
+
+
+    <<CACHEPOINT>>
+
+
+    <extraction-results>
+
+    {EXTRACTION_RESULTS}
+
+    </extraction-results>
+
+
+    <document-image>
+
+    {DOCUMENT_IMAGE}
+
+    </document-image>
+
+
+    <ocr-text-confidence-results>
+
+    {OCR_TEXT_CONFIDENCE}
+
+    </ocr-text-confidence-results>
+
+
+    <final-instructions>
+
+    Analyze the extraction results against the source document and provide confidence assessments. Return a JSON object with the following structure:
+
+    {
+      "attribute_name_1": {
+        "confidence": 0.85,
+        "confidence_reason": "Clear text evidence found in document header with high OCR confidence (0.98). Value matches exactly."
+      },
+      "attribute_name_2": {
+        "confidence": 0.65,
+        "confidence_reason": "Text is partially unclear due to poor scan quality. OCR confidence low (0.72) in this region."
+      }
+    }
+
+    Include assessments for ALL attributes present in the extraction results.
+
+    </final-instructions>
+  temperature: '0.0'
+  model: us.amazon.nova-pro-v1:0
+  system_prompt: >-
+    You are a document analysis assessment expert. Your task is to evaluate the confidence and accuracy of extraction results by analyzing the source document evidence. Respond only with JSON containing confidence scores and reasoning for each extracted attribute.
 evaluation:
   llm_method:
     top_p: '0.1'
@@ -622,7 +729,7 @@ evaluation:
       "reason": "Your explanation here"
     }
     temperature: '0.0'
-    model: us.anthropic.claude-3-5-sonnet-20241022-v2:0
+    model: us.anthropic.claude-3-haiku-20240307-v1:0
     system_prompt: >-
       You are an evaluator that helps determine if the predicted and expected values match for document attribute extraction. You will consider the context and meaning rather than just exact string matching.
 pricing:
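Taken together, the per-attribute confidence_threshold values above (0.85, 0.8, 0.9), the default_confidence_threshold of '0.9', and the JSON structure the task_prompt requests are enough to sketch how the per-attribute explainability output might be assembled. The helper below is a minimal illustrative sketch, not code from this commit: the field names come from the config, while the function and dictionary names are hypothetical.

```python
# Minimal sketch: join the assessment model's JSON response with the
# configured thresholds. Field names (confidence, confidence_reason,
# confidence_threshold, default_confidence_threshold) come from the config;
# the helper itself and the "below_threshold" flag are assumptions.
import json

DEFAULT_THRESHOLD = 0.9            # default_confidence_threshold above
ATTRIBUTE_THRESHOLDS = {           # per-attribute values from the letter class
    "sender_name": 0.85,
    "sender_address": 0.80,
    "recipient_name": 0.90,
}

def build_explainability_info(assessment_response: str) -> dict:
    """Attach the applicable threshold to each assessed attribute."""
    assessments = json.loads(assessment_response)
    info = {}
    for attr, result in assessments.items():
        threshold = ATTRIBUTE_THRESHOLDS.get(attr, DEFAULT_THRESHOLD)
        info[attr] = {
            "confidence": result["confidence"],
            "confidence_threshold": threshold,
            "confidence_reason": result["confidence_reason"],
            "below_threshold": result["confidence"] < threshold,
        }
    return info

sample = """{
  "sender_name": {"confidence": 0.85,
                  "confidence_reason": "Clear text evidence with high OCR confidence."}
}"""
print(build_explainability_info(sample))
```

Attributes without an explicit threshold, such as recipient_address and date, fall back to the default, which is why the config pairs per-attribute values with a single default_confidence_threshold.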

config_library/pattern-2/few_shot_example_with_multimodal_page_classification/config.yaml

Lines changed: 104 additions & 1 deletion
@@ -877,6 +877,109 @@ pricing:
         price: '3.0E-7'
       - name: cacheWriteInputTokens
         price: '3.75E-6'
+assessment:
+  top_p: '0.1'
+  max_tokens: '4096'
+  top_k: '5'
+  task_prompt: >-
+    <background>
+
+    You are an expert document analysis assessment system. Your task is to evaluate the confidence and accuracy of extraction results for a document of class {DOCUMENT_CLASS}.
+
+    </background>
+
+
+    <task>
+
+    Analyze the extraction results against the source document and provide confidence assessments for each extracted attribute. Consider factors such as:
+
+    1. Text clarity and OCR quality in the source regions
+    2. Alignment between extracted values and document content
+    3. Presence of clear evidence supporting the extraction
+    4. Potential ambiguity or uncertainty in the source material
+    5. Completeness and accuracy of the extracted information
+
+    </task>
+
+
+    <assessment-guidelines>
+
+    For each attribute, provide:
+    1. A confidence score between 0.0 and 1.0 where:
+       - 1.0 = Very high confidence, clear and unambiguous evidence
+       - 0.8-0.9 = High confidence, strong evidence with minor uncertainty
+       - 0.6-0.7 = Medium confidence, reasonable evidence but some ambiguity
+       - 0.4-0.5 = Low confidence, weak or unclear evidence
+       - 0.0-0.3 = Very low confidence, little to no supporting evidence
+
+    2. A clear reason explaining the confidence score, including:
+       - What evidence supports or contradicts the extraction
+       - Any OCR quality issues that affect confidence
+       - Clarity of the source document in relevant areas
+       - Any ambiguity or uncertainty factors
+
+    Guidelines:
+    - Base assessments on actual document content and OCR quality
+    - Consider both text-based evidence and visual/layout clues
+    - Account for OCR confidence scores when provided
+    - Be objective and specific in reasoning
+    - If an extraction appears incorrect, score accordingly with explanation
+
+    </assessment-guidelines>
+
+    <attributes-definitions>
+
+    {ATTRIBUTE_NAMES_AND_DESCRIPTIONS}
+
+    </attributes-definitions>
+
+
+    <<CACHEPOINT>>
+
+
+    <extraction-results>
+
+    {EXTRACTION_RESULTS}
+
+    </extraction-results>
+
+
+    <document-image>
+
+    {DOCUMENT_IMAGE}
+
+    </document-image>
+
+
+    <ocr-text-confidence-results>
+
+    {OCR_TEXT_CONFIDENCE}
+
+    </ocr-text-confidence-results>
+
+
+    <final-instructions>
+
+    Analyze the extraction results against the source document and provide confidence assessments. Return a JSON object with the following structure:
+
+    {
+      "attribute_name_1": {
+        "confidence": 0.85,
+        "confidence_reason": "Clear text evidence found in document header with high OCR confidence (0.98). Value matches exactly."
+      },
+      "attribute_name_2": {
+        "confidence": 0.65,
+        "confidence_reason": "Text is partially unclear due to poor scan quality. OCR confidence low (0.72) in this region."
+      }
+    }
+
+    Include assessments for ALL attributes present in the extraction results.
+
+    </final-instructions>
+  temperature: '0.0'
+  model: us.amazon.nova-pro-v1:0
+  system_prompt: >-
+    You are a document analysis assessment expert. Your task is to evaluate the confidence and accuracy of extraction results by analyzing the source document evidence. Respond only with JSON containing confidence scores and reasoning for each extracted attribute.
 evaluation:
   llm_method:
     top_p: '0.1'
@@ -916,7 +1019,7 @@ evaluation:
       "reason": "Your explanation here"
     }
     temperature: '0.0'
-    model: us.anthropic.claude-3-5-sonnet-20241022-v2:0
+    model: us.anthropic.claude-3-haiku-20240307-v1:0
     system_prompt: >
       You are an evaluator that helps determine if the predicted and expected
      values match for document attribute extraction. You will consider the
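Both assessment task_prompts place a <<CACHEPOINT>> marker between the static instructions (background, guidelines, attribute definitions) and the per-document content (extraction results, images, OCR confidence). A plausible reading, assumed here rather than stated in this commit, is that the marker splits the prompt so the static prefix can be reused through Bedrock prompt caching. The sketch below shows that split using the Converse API's cachePoint content block; it requires a boto3 version with prompt-caching support, and the wiring is illustrative rather than the solution's actual code.

```python
# Hedged sketch: split a task_prompt at <<CACHEPOINT>> so the static prefix
# can be cached across Bedrock invocations. How this solution actually wires
# the marker up is an assumption; cachePoint is Bedrock's prompt-caching block.
import boto3

def build_messages(task_prompt: str) -> list:
    prefix, marker, suffix = task_prompt.partition("<<CACHEPOINT>>")
    content = [{"text": prefix}]
    if marker:
        # Everything before the marker is stable per document class; the
        # extraction results and images after it change on every call.
        content.append({"cachePoint": {"type": "default"}})
        content.append({"text": suffix})
    return [{"role": "user", "content": content}]

client = boto3.client("bedrock-runtime")
response = client.converse(
    modelId="us.amazon.nova-pro-v1:0",  # model configured above
    messages=build_messages("static instructions <<CACHEPOINT>> per-document content"),
)
print(response["output"]["message"]["content"][0]["text"])
```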
