
Commit d2d5881

Bob Strahan committed: Merge branch 'develop' v0.3.3
2 parents: 3c6926b + 83fd6c1

File tree: 79 files changed (+13582 −200 lines). Only a subset of the changed files is shown below.


CHANGELOG.md

Lines changed: 58 additions & 0 deletions
@@ -5,6 +5,64 @@ SPDX-License-Identifier: MIT-0
 
 ## [Unreleased]
 
+## [0.3.3]
+
+### Added
+
+- **Amazon Nova Model Fine-tuning Support**
+  - Added comprehensive `ModelFinetuningService` class for managing Nova model fine-tuning workflows
+  - Support for fine-tuning Amazon Nova models (Nova Lite, Nova Pro) using Amazon Bedrock
+  - Complete end-to-end workflow including dataset preparation, job creation, provisioned throughput management, and inference
+  - CLI tools for the fine-tuning workflow:
+    - `prepare_nova_finetuning_data.py` - Dataset preparation from RVL-CDIP or custom datasets
+    - `create_finetuning_job.py` - Fine-tuning job creation with automatic IAM role setup
+    - `create_provisioned_throughput.py` - Provisioned throughput management for fine-tuned models
+    - `inference_example.py` - Model inference and evaluation with comparison capabilities
+  - CloudFormation integration with new parameters:
+    - `CustomClassificationModelARN` - Support for custom fine-tuned classification models in Pattern-2
+    - `CustomExtractionModelARN` - Support for custom fine-tuned extraction models in Pattern-2
+  - Automatic integration of fine-tuned models in classification and extraction model selection dropdowns
+  - Comprehensive documentation in `docs/nova-finetuning.md` with step-by-step instructions
+  - Example notebooks:
+    - `finetuning_dataset_prep.ipynb` - Interactive dataset preparation
+    - `finetuning_model_service_demo.ipynb` - Service usage demonstration
+    - `finetuning_model_document_classification_evaluation.ipynb` - Model evaluation
+  - Built-in support for the Bedrock fine-tuning format with multi-modal capabilities
+  - Data splitting and validation set creation
+  - Cost optimization features, including provisioned throughput deletion
+  - Performance metrics and accuracy evaluation tools
+
+- **Assessment Feature for Extraction Confidence Evaluation (EXPERIMENTAL)**
+  - Added a new assessment service that evaluates extraction confidence by using LLMs to analyze extraction results against source documents
+  - Multi-modal assessment capability combining text analysis with document images for comprehensive confidence scoring
+  - UI integration with explainability_info display showing per-attribute confidence scores, thresholds, and explanations
+  - Optional deployment controlled by the `IsAssessmentEnabled` parameter (defaults to false)
+  - Added `e2e-example-with-assessment.ipynb` notebook for testing the assessment workflow
+
+- **Enhanced Evaluation Framework with Confidence Integration**
+  - Added confidence fields to evaluation reports for quality analysis
+  - Automatic extraction and display of confidence scores from assessment explainability_info
+  - Enhanced JSON and Markdown evaluation reports with confidence columns
+  - Backward-compatible integration - shows "N/A" when confidence data is unavailable
+
+- **Evaluation Analytics Database and Reporting System**
+  - Added comprehensive ReportingDatabase (AWS Glue) with structured evaluation metrics storage
+  - Three-tier analytics tables: document_evaluations, section_evaluations, and attribute_evaluations
+  - Automatic partitioning by date and document for efficient querying with Amazon Athena
+  - Detailed metrics tracking including accuracy, precision, recall, F1 score, execution time, and evaluation methods
+  - Added `evaluation_reporting_analytics.ipynb` notebook for comprehensive performance analysis and visualization
+  - Multi-level analytics with document-, section-, and attribute-level insights
+  - Visual dashboards showing accuracy distributions, performance trends, and problematic patterns
+  - Configurable filters for date ranges, document types, and evaluation thresholds
+  - Integration with the existing evaluation framework - metrics are automatically saved to the database
+  - ReportingDatabase output added to the CloudFormation template for easy reference
+
+### Fixed
+
+- Fixed build failure related to pandas, numpy, and PyMuPDF dependency conflicts in the idp_common_pkg package
+- Fixed deployment failure caused by a CodeBuild project timeout by raising the TimeoutInMinutes property
+- Added missing cached-token metrics to CloudWatch dashboards
+- Added the Bedrock model access prerequisite to the README and deployment docs
+
 ## [0.3.2]
 
 ### Added
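For orientation, the fine-tuning entries above map onto a small number of Amazon Bedrock control-plane calls. The sketch below shows the core job-creation and provisioned-throughput steps with plain boto3; the role ARN, bucket URIs, resource names, and hyperparameters are placeholder assumptions, and the repo's `create_finetuning_job.py` and `create_provisioned_throughput.py` scripts wrap equivalent logic with extra setup such as automatic IAM role creation.

```python
# Minimal sketch of the Bedrock fine-tuning flow (all names/ARNs are placeholders).
import time
import boto3

bedrock = boto3.client("bedrock", region_name="us-east-1")

# 1. Create the fine-tuning (model customization) job. The training file is the
#    JSONL produced by the dataset-preparation step.
job = bedrock.create_model_customization_job(
    jobName="idp-nova-classification-ft",                    # placeholder
    customModelName="idp-nova-lite-classifier",              # placeholder
    roleArn="arn:aws:iam::123456789012:role/BedrockFtRole",  # placeholder
    baseModelIdentifier="amazon.nova-lite-v1:0:300k",        # verify available base models
    customizationType="FINE_TUNING",
    trainingDataConfig={"s3Uri": "s3://my-bucket/train.jsonl"},
    outputDataConfig={"s3Uri": "s3://my-bucket/output/"},
    hyperParameters={"epochCount": "2", "learningRate": "0.00001"},  # illustrative values
)

# 2. Poll until the job completes, then fetch the custom model ARN.
while True:
    desc = bedrock.get_model_customization_job(jobIdentifier=job["jobArn"])
    if desc["status"] in ("Completed", "Failed", "Stopped"):
        break
    time.sleep(60)

# 3. Custom models need provisioned throughput before they can be invoked.
#    The returned ARN is what Pattern-2 accepts as CustomClassificationModelARN.
pt = bedrock.create_provisioned_model_throughput(
    provisionedModelName="idp-nova-lite-classifier-pt",      # placeholder
    modelId=desc["outputModelArn"],
    modelUnits=1,
)
print(pt["provisionedModelArn"])
```

Deleting the provisioned throughput when idle (`delete_provisioned_model_throughput`) is what the cost-optimization bullet above refers to: provisioned capacity bills by the hour whether or not it is invoked.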

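The analytics tables are likewise meant to be queried with Athena. Here is a hedged sketch assuming a database named `idp_reporting`, a boolean `matched` column on `attribute_evaluations`, and an S3 results location; the real database name comes from the stack's ReportingDatabase output, and the column names from the Glue table schemas.

```python
# Sketch: find the worst-performing attributes via Athena (names are assumed).
import time
import boto3

athena = boto3.client("athena", region_name="us-east-1")

query = """
SELECT attribute_name,
       AVG(CASE WHEN matched THEN 1.0 ELSE 0.0 END) AS match_rate,
       COUNT(*) AS n
FROM attribute_evaluations
GROUP BY attribute_name
ORDER BY match_rate ASC
LIMIT 20
"""

qid = athena.start_query_execution(
    QueryString=query,
    QueryExecutionContext={"Database": "idp_reporting"},            # from stack output
    ResultConfiguration={"OutputLocation": "s3://my-athena-results/"},  # placeholder
)["QueryExecutionId"]

# Poll until the query finishes, then print the result rows.
while True:
    state = athena.get_query_execution(QueryExecutionId=qid)["QueryExecution"]["Status"]["State"]
    if state in ("SUCCEEDED", "FAILED", "CANCELLED"):
        break
    time.sleep(1)

if state == "SUCCEEDED":
    rows = athena.get_query_results(QueryExecutionId=qid)["ResultSet"]["Rows"]
    for row in rows[1:]:  # first row is the header
        print([col.get("VarCharValue") for col in row["Data"]])
```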
Makefile

Lines changed: 16 additions & 1 deletion
@@ -6,6 +6,13 @@ GREEN := \033[0;32m
 YELLOW := \033[1;33m
 NC := \033[0m # No Color
 
+# Default target - run both lint and test
+all: lint test
+
+# Run tests in idp_common_pkg directory
+test:
+	$(MAKE) -C lib/idp_common_pkg test
+
 # Run both linting and formatting in one command
 lint: ruff-lint format
 
@@ -31,4 +38,12 @@ lint-cicd:
 		echo "$(YELLOW)Please run 'make format' locally to fix these issues.$(NC)"; \
 		exit 1; \
 	fi
-	@echo "$(GREEN)All code quality checks passed!$(NC)"
+	@echo "$(GREEN)All code quality checks passed!$(NC)"
+
+# A convenience Makefile target that runs lint and test, infers a commit message with Amazon Q, then commits and pushes
+commit: lint test
+	$(info Generating commit message...)
+	export COMMIT_MESSAGE="$(shell q chat --no-interactive --trust-all-tools "Understand pending local git change and changes to be committed, then infer a commit message. Return this commit message only" | tail -n 1 | sed 's/\x1b\[[0-9;]*m//g')" && \
+	git add . && \
+	git commit -am "$${COMMIT_MESSAGE}" && \
+	git push
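Note that the `commit` target shells out to `q chat`, so it presumably requires the Amazon Q Developer CLI (`q`) to be installed and authenticated locally; the trailing `sed` strips ANSI color codes from the generated message before it is handed to `git commit`. Because `all` is now the first target, running plain `make` executes `lint` followed by `test`.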

README.md

Lines changed: 10 additions & 4 deletions
@@ -35,6 +35,7 @@ A scalable, serverless solution for automated document processing and information extraction
 - **Comprehensive Monitoring**: Rich CloudWatch dashboard with detailed metrics and logs
 - **Web User Interface**: Modern UI for inspecting document workflow status and results
 - **AI-Powered Evaluation**: Framework to assess accuracy against baseline data
+- **Extraction Confidence Assessment**: LLM-powered assessment of extraction confidence with multimodal document analysis
 - **Document Knowledge Base Query**: Ask questions about your processed documents
 
 ## Architecture Overview
@@ -71,8 +72,8 @@ After deployment, you can quickly process a document and view results:
    - **Via S3**: Upload directly to the S3 input bucket (find the bucket URL in CloudFormation stack Outputs)
 
 2. **Use Sample Documents**:
-   - For Pattern 1 (BDA): Use `samples/lending_package.pdf`
-   - For Patterns 2 and 3: Use `samples/rvl_cdip_package.pdf`
+   - For Pattern 1 (BDA): Use [samples/lending_package.pdf](./samples/lending_package.pdf)
+   - For Patterns 2 and 3: Use [samples/rvl_cdip_package.pdf](./samples/rvl_cdip_package.pdf)
 
 3. **Monitor Processing**:
    - **Via Web UI**: Track document status on the dashboard
@@ -84,6 +85,10 @@ After deployment, you can quickly process a document and view results:
 
 See the [Deployment Guide](./docs/deployment.md#testing-the-solution) for more detailed testing instructions.
 
+IMPORTANT: If you have not previously done so, you must [request access](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html) to the following Amazon Bedrock models:
+- Amazon: All Nova models, plus Titan Text Embeddings V2
+- Anthropic: Claude 3.x models, Claude 4.x models
+
 ## Updating an Existing Deployment
 
 To update an existing GenAIIDP stack to a new version:
@@ -110,11 +115,12 @@ For detailed deployment and testing instructions, see the [Deployment Guide](./docs/deployment.md)
 - [Architecture](./docs/architecture.md) - Detailed component architecture and data flow
 - [Deployment](./docs/deployment.md) - Build, publish, deploy, and test instructions
 - [Web UI](./docs/web-ui.md) - Web interface features and usage
-- [Knowledge Base](./docs/knowledge-base.md) - Document knowledge base query feature
-- [Evaluation Framework](./docs/evaluation.md) - Accuracy assessment system
 - [Configuration](./docs/configuration.md) - Configuration and customization options
 - [Classification](./docs/classification.md) - Customizing document classification
 - [Extraction](./docs/extraction.md) - Customizing information extraction
+- [Assessment](./docs/assessment.md) - Extraction confidence evaluation using LLMs
+- [Evaluation Framework](./docs/evaluation.md) - Accuracy assessment system with analytics database and reporting
+- [Knowledge Base](./docs/knowledge-base.md) - Document knowledge base query feature
 - [Monitoring](./docs/monitoring.md) - Monitoring and logging capabilities
 - [Troubleshooting](./docs/troubleshooting.md) - Troubleshooting and performance guides
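The quick start's "Via S3" option needs nothing more than an object upload to kick off processing. A minimal sketch, assuming a placeholder bucket name (read the real one from the CloudFormation stack Outputs):

```python
# Sketch: drop a sample document into the input bucket to trigger processing.
import boto3

s3 = boto3.client("s3")
s3.upload_file(
    Filename="samples/rvl_cdip_package.pdf",  # sample for Patterns 2 and 3
    Bucket="my-genaiidp-input-bucket",        # placeholder - see stack Outputs
    Key="rvl_cdip_package.pdf",
)
```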

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-0.3.2
+0.3.3

config_library/pattern-2/default/config.yaml

Lines changed: 108 additions & 1 deletion
@@ -13,10 +13,13 @@ classes:
     attributes:
       - name: sender_name
         description: The name of the person or entity who wrote or sent the letter. Look for text following or near terms like 'from', 'sender', 'authored by', 'written by', or at the end of the letter before a signature.
+        confidence_threshold: '0.85'
       - name: sender_address
         description: The physical address of the sender, typically appearing at the top of the letter. May be labeled as 'address', 'location', or 'from address'.
+        confidence_threshold: '0.8'
       - name: recipient_name
         description: The name of the person or entity receiving the letter. Look for this after 'to', 'recipient', 'addressee', or at the beginning of the letter.
+        confidence_threshold: '0.9'
       - name: recipient_address
         description: The physical address where the letter is to be delivered. Often labeled as 'to address' or 'delivery address', typically appearing below the recipient name.
       - name: date
@@ -587,6 +590,110 @@ summarization:
   model: us.anthropic.claude-3-7-sonnet-20250219-v1:0
   system_prompt: >-
     You are a document summarization expert who can analyze and summarize documents from various domains including medical, financial, legal, and general business documents. Your task is to create a summary that captures the key information, main points, and important details from the document. Your output must be in valid JSON format. \nSummarization Style: Balanced\\nCreate a balanced summary that provides a moderate level of detail. Include the main points and key supporting information, while maintaining the document's overall structure. Aim for a comprehensive yet concise summary.\n Your output MUST be in valid JSON format with markdown content. You MUST strictly adhere to the output format specified in the instructions.
+assessment:
+  default_confidence_threshold: '0.9'
+  top_p: '0.1'
+  max_tokens: '4096'
+  top_k: '5'
+  task_prompt: >-
+    <background>
+
+    You are an expert document analysis assessment system. Your task is to evaluate the confidence and accuracy of extraction results for a document of class {DOCUMENT_CLASS}.
+
+    </background>
+
+
+    <task>
+
+    Analyze the extraction results against the source document and provide confidence assessments for each extracted attribute. Consider factors such as:
+
+    1. Text clarity and OCR quality in the source regions
+    2. Alignment between extracted values and document content
+    3. Presence of clear evidence supporting the extraction
+    4. Potential ambiguity or uncertainty in the source material
+    5. Completeness and accuracy of the extracted information
+
+    </task>
+
+
+    <assessment-guidelines>
+
+    For each attribute, provide:
+    1. A confidence score between 0.0 and 1.0 where:
+       - 1.0 = Very high confidence, clear and unambiguous evidence
+       - 0.8-0.9 = High confidence, strong evidence with minor uncertainty
+       - 0.6-0.7 = Medium confidence, reasonable evidence but some ambiguity
+       - 0.4-0.5 = Low confidence, weak or unclear evidence
+       - 0.0-0.3 = Very low confidence, little to no supporting evidence
+
+    2. A clear reason explaining the confidence score, including:
+       - What evidence supports or contradicts the extraction
+       - Any OCR quality issues that affect confidence
+       - Clarity of the source document in relevant areas
+       - Any ambiguity or uncertainty factors
+
+    Guidelines:
+    - Base assessments on actual document content and OCR quality
+    - Consider both text-based evidence and visual/layout clues
+    - Account for OCR confidence scores when provided
+    - Be objective and specific in reasoning
+    - If an extraction appears incorrect, score accordingly with explanation
+
+    </assessment-guidelines>
+
+    <attributes-definitions>
+
+    {ATTRIBUTE_NAMES_AND_DESCRIPTIONS}
+
+    </attributes-definitions>
+
+
+    <<CACHEPOINT>>
+
+
+    <extraction-results>
+
+    {EXTRACTION_RESULTS}
+
+    </extraction-results>
+
+
+    <document-image>
+
+    {DOCUMENT_IMAGE}
+
+    </document-image>
+
+
+    <ocr-text-confidence-results>
+
+    {OCR_TEXT_CONFIDENCE}
+
+    </ocr-text-confidence-results>
+
+
+    <final-instructions>
+
+    Analyze the extraction results against the source document and provide confidence assessments. Return a JSON object with the following structure:
+
+    {
+      "attribute_name_1": {
+        "confidence": 0.85,
+        "confidence_reason": "Clear text evidence found in document header with high OCR confidence (0.98). Value matches exactly."
+      },
+      "attribute_name_2": {
+        "confidence": 0.65,
+        "confidence_reason": "Text is partially unclear due to poor scan quality. OCR confidence low (0.72) in this region."
+      }
+    }
+
+    Include assessments for ALL attributes present in the extraction results.
+
+    </final-instructions>
+  temperature: '0.0'
+  model: us.amazon.nova-pro-v1:0
+  system_prompt: >-
+    You are a document analysis assessment expert. Your task is to evaluate the confidence and accuracy of extraction results by analyzing the source document evidence. Respond only with JSON containing confidence scores and reasoning for each extracted attribute.
 evaluation:
   llm_method:
     top_p: '0.1'
@@ -622,7 +729,7 @@ evaluation:
         "reason": "Your explanation here"
       }
     temperature: '0.0'
-    model: us.anthropic.claude-3-5-sonnet-20241022-v2:0
+    model: us.anthropic.claude-3-haiku-20240307-v1:0
     system_prompt: >-
       You are an evaluator that helps determine if the predicted and expected values match for document attribute extraction. You will consider the context and meaning rather than just exact string matching.
 pricing:
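To make the threshold mechanics concrete: each attribute may carry its own `confidence_threshold`, with the assessment-level `default_confidence_threshold` as the fallback, and the assessment prompt returns per-attribute `confidence` scores. A small sketch of how the two might be combined to flag attributes for review; the function and variable names are illustrative, not the repo's actual API:

```python
# Sketch: flag extracted attributes whose assessed confidence falls below
# their configured threshold. Names are illustrative, not the repo's API.
from typing import Any

def flag_low_confidence(
    assessment: dict[str, dict[str, Any]],   # assessment output, keyed by attribute
    attribute_config: list[dict[str, Any]],  # the class's `attributes` list from config.yaml
    default_threshold: float = 0.9,          # assessment.default_confidence_threshold
) -> list[str]:
    # Per-attribute thresholds, falling back to the assessment-level default.
    thresholds = {
        attr["name"]: float(attr.get("confidence_threshold", default_threshold))
        for attr in attribute_config
    }
    flagged = []
    for name, result in assessment.items():
        if result["confidence"] < thresholds.get(name, default_threshold):
            flagged.append(name)
    return flagged

# Example with the letter class above: sender_address (threshold 0.8) passes,
# recipient_name (threshold 0.9) is flagged for review.
assessment = {
    "sender_address": {"confidence": 0.82, "confidence_reason": "Minor OCR noise."},
    "recipient_name": {"confidence": 0.70, "confidence_reason": "Ambiguous addressee."},
}
attrs = [
    {"name": "sender_address", "confidence_threshold": "0.8"},
    {"name": "recipient_name", "confidence_threshold": "0.9"},
]
print(flag_low_confidence(assessment, attrs))  # ['recipient_name']
```

The `<<CACHEPOINT>>` marker in the task prompt splits the prompt into a stable prefix (instructions and attribute definitions) and a per-document suffix, which is consistent with the cacheReadInputTokens/cacheWriteInputTokens pricing entries elsewhere in this config.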

config_library/pattern-2/few_shot_example_with_multimodal_page_classification/config.yaml

Lines changed: 104 additions & 1 deletion
@@ -877,6 +877,109 @@ pricing:
       price: '3.0E-7'
     - name: cacheWriteInputTokens
       price: '3.75E-6'
+assessment:
+  top_p: '0.1'
+  max_tokens: '4096'
+  top_k: '5'
+  task_prompt: >-
+    <background>
+
+    You are an expert document analysis assessment system. Your task is to evaluate the confidence and accuracy of extraction results for a document of class {DOCUMENT_CLASS}.
+
+    </background>
+
+
+    <task>
+
+    Analyze the extraction results against the source document and provide confidence assessments for each extracted attribute. Consider factors such as:
+
+    1. Text clarity and OCR quality in the source regions
+    2. Alignment between extracted values and document content
+    3. Presence of clear evidence supporting the extraction
+    4. Potential ambiguity or uncertainty in the source material
+    5. Completeness and accuracy of the extracted information
+
+    </task>
+
+
+    <assessment-guidelines>
+
+    For each attribute, provide:
+    1. A confidence score between 0.0 and 1.0 where:
+       - 1.0 = Very high confidence, clear and unambiguous evidence
+       - 0.8-0.9 = High confidence, strong evidence with minor uncertainty
+       - 0.6-0.7 = Medium confidence, reasonable evidence but some ambiguity
+       - 0.4-0.5 = Low confidence, weak or unclear evidence
+       - 0.0-0.3 = Very low confidence, little to no supporting evidence
+
+    2. A clear reason explaining the confidence score, including:
+       - What evidence supports or contradicts the extraction
+       - Any OCR quality issues that affect confidence
+       - Clarity of the source document in relevant areas
+       - Any ambiguity or uncertainty factors
+
+    Guidelines:
+    - Base assessments on actual document content and OCR quality
+    - Consider both text-based evidence and visual/layout clues
+    - Account for OCR confidence scores when provided
+    - Be objective and specific in reasoning
+    - If an extraction appears incorrect, score accordingly with explanation
+
+    </assessment-guidelines>
+
+    <attributes-definitions>
+
+    {ATTRIBUTE_NAMES_AND_DESCRIPTIONS}
+
+    </attributes-definitions>
+
+
+    <<CACHEPOINT>>
+
+
+    <extraction-results>
+
+    {EXTRACTION_RESULTS}
+
+    </extraction-results>
+
+
+    <document-image>
+
+    {DOCUMENT_IMAGE}
+
+    </document-image>
+
+
+    <ocr-text-confidence-results>
+
+    {OCR_TEXT_CONFIDENCE}
+
+    </ocr-text-confidence-results>
+
+
+    <final-instructions>
+
+    Analyze the extraction results against the source document and provide confidence assessments. Return a JSON object with the following structure:
+
+    {
+      "attribute_name_1": {
+        "confidence": 0.85,
+        "confidence_reason": "Clear text evidence found in document header with high OCR confidence (0.98). Value matches exactly."
+      },
+      "attribute_name_2": {
+        "confidence": 0.65,
+        "confidence_reason": "Text is partially unclear due to poor scan quality. OCR confidence low (0.72) in this region."
+      }
+    }
+
+    Include assessments for ALL attributes present in the extraction results.
+
+    </final-instructions>
+  temperature: '0.0'
+  model: us.amazon.nova-pro-v1:0
+  system_prompt: >-
+    You are a document analysis assessment expert. Your task is to evaluate the confidence and accuracy of extraction results by analyzing the source document evidence. Respond only with JSON containing confidence scores and reasoning for each extracted attribute.
 evaluation:
   llm_method:
     top_p: '0.1'
@@ -916,7 +1019,7 @@ evaluation:
         "reason": "Your explanation here"
      }
     temperature: '0.0'
-    model: us.anthropic.claude-3-5-sonnet-20241022-v2:0
+    model: us.anthropic.claude-3-haiku-20240307-v1:0
     system_prompt: >
       You are an evaluator that helps determine if the predicted and expected
       values match for document attribute extraction. You will consider the
