Commit a5d7390

Author: Bob Strahan (committed)
Merge branch 'main' of ssh.gitlab.aws.dev:genaiic-reusable-assets/engagement-artifacts/genaiic-idp-accelerator
2 parents: d7575c2 + b419a3c (commit a5d7390)

File tree

58 files changed: +5004 additions, -1887 deletions


CHANGELOG.md

Lines changed: 32 additions & 0 deletions

@@ -5,6 +5,38 @@ SPDX-License-Identifier: MIT-0

## [Unreleased]

### Added

## [0.3.16]

### Added

- **S3 Vectors Support for Cost-Optimized Knowledge Base Storage**
  - Added S3 Vectors as an alternative vector store option to OpenSearch Serverless for the Bedrock Knowledge Base, offering lower storage costs
  - Custom resource Lambda implementation for S3 vector bucket and index management (using the boto3 s3vectors client), with proper IAM permissions and resource cleanup
  - Unified Knowledge Base interface supporting both vector store types, with automatic resource provisioning based on the user's selection
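The custom resource flow described above can be sketched roughly as follows. This is a hypothetical illustration only, not the repo's implementation; the boto3 `s3vectors` operation names and the `VectorBucketName`/`IndexName`/`Dimension` resource properties are assumptions:

```python
# Hypothetical sketch of a CloudFormation custom-resource handler for an
# S3 vector bucket and index. Operation names are assumptions, not the repo's code.

def physical_id(bucket: str, index: str) -> str:
    """Stable PhysicalResourceId so CloudFormation can track the index."""
    return f"{bucket}/{index}"

def handle_event(event: dict) -> dict:
    """Dispatch a custom-resource event to create/delete the vector resources."""
    props = event["ResourceProperties"]
    bucket, index = props["VectorBucketName"], props["IndexName"]
    if event["RequestType"] == "Create":
        import boto3  # imported lazily; available in the Lambda runtime
        s3v = boto3.client("s3vectors")
        s3v.create_vector_bucket(vectorBucketName=bucket)
        s3v.create_index(vectorBucketName=bucket, indexName=index,
                         dataType="float32",
                         dimension=int(props["Dimension"]),
                         distanceMetric="cosine")
    elif event["RequestType"] == "Delete":
        import boto3
        s3v = boto3.client("s3vectors")
        s3v.delete_index(vectorBucketName=bucket, indexName=index)
        s3v.delete_vector_bucket(vectorBucketName=bucket)
    # "Update" falls through: nothing to change for a fixed bucket/index pair
    return {"PhysicalResourceId": physical_id(bucket, index)}
```

A real handler would also signal success/failure back to CloudFormation and handle partially created resources during cleanup.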
- **Page Limit Configuration for Classification Control**
  - Added a `maxPagesForClassification` configuration option to control how many pages are used during document classification
  - **Default Behavior**: `"ALL"` uses all pages for classification (the existing behavior)
  - **Limited Page Classification**: set a numeric value (e.g., `"1"`, `"2"`, `"3"`) to classify only the first N pages
  - **Important**: when a numeric limit is used, the classification result from the first N pages is applied to ALL pages in the document, effectively forcing the entire document into a single class with one section
  - **Use Cases**: performance optimization for large documents, cost reduction for documents with consistent classification patterns, and simplified processing for homogeneous document types
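For example, a configuration might limit classification to the first two pages. A sketch only; the `classification` key placement follows the sample configs updated in this commit:

```yaml
classification:
  # Classify using only the first 2 pages; the resulting class is then
  # applied to every page, producing a single class and one section.
  maxPagesForClassification: "2"
  # Default: use every page (the pre-0.3.16 behavior).
  # maxPagesForClassification: "ALL"
```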
- **CloudFormation Service Role for Delegated Deployment Access**
  - Added an example CloudFormation service role template that lets non-administrator users deploy and maintain IDP stacks without ongoing administrator permissions
  - Administrators provision the service role once with elevated privileges, then delegate deployment capabilities to developer/DevOps teams
  - Includes comprehensive documentation and cross-referenced deployment guides explaining the security model and setup process
### Fixed

- Fixed an issue where CloudFront policy statements still appeared in generated GovCloud templates even though the CloudFront resources had been removed
- Fixed duplicate Glue tables being created when a document class contains a dash (-); resolved by replacing dashes in section types with underscores when creating the table, aligning with the table name the Glue crawler generates later - resolves #57
- Fixed an occasional UI error, "Failed to get document details - please try again later" - resolves #58
- Fixed UI zipfile creation to exclude .aws-sam directories and .env files from the deployment package
- Added a security recommendation to set the LogLevel parameter to WARN or ERROR (not INFO) for production deployments, to prevent logging of sensitive information including PII, document contents, and S3 presigned URLs
- Hardened several aspects of the new Discovery feature
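The dash-to-underscore fix for #57 amounts to normalizing the section type before creating the table, so the pre-created name matches the one the Glue crawler derives later. A minimal sketch (the helper name is hypothetical):

```python
def glue_safe_table_name(section_type: str) -> str:
    """Replace dashes with underscores so the table created for a section type
    matches the table name the Glue crawler generates later (which also uses
    underscores), avoiding a duplicate table per document class."""
    return section_type.replace("-", "_")
```

With this, a `bank-statement` class and the crawler agree on a single `bank_statement` table.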
## [0.3.15]

### Added

VERSION

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-0.3.15
+0.3.16

config_library/pattern-1/lending-package-sample/config.yaml

Lines changed: 102 additions & 0 deletions

@@ -215,3 +215,105 @@ pricing:
      price: '1.5E-6'
    - name: cacheWriteInputTokens
      price: '1.875E-5'

The following `discovery` section is appended:

discovery:
  output_format:
    sample_json: |-
      {
        "document_class" : "Form-1040",
        "document_description" : "Brief summary of the document",
        "groups" : [
          {
            "name" : "PersonalInformation",
            "description" : "Personal information of the taxpayer",
            "attributeType" : "group",
            "groupAttributes" : [
              {
                "name": "FirstName",
                "dataType" : "string",
                "description" : "First name of the taxpayer"
              },
              {
                "name": "Age",
                "dataType" : "number",
                "description" : "Age of the taxpayer"
              }
            ]
          },
          {
            "name" : "Dependents",
            "description" : "Dependents of the taxpayer",
            "attributeType" : "list",
            "listItemTemplate": {
              "itemAttributes" : [
                {
                  "name": "FirstName",
                  "dataType" : "string",
                  "description" : "Dependent's first name"
                },
                {
                  "name": "Age",
                  "dataType" : "number",
                  "description" : "Dependent's age"
                }
              ]
            }
          }
        ]
      }
  with_ground_truth:
    top_p: '0.1'
    temperature: '1.0'
    user_prompt: >-
      This image contains unstructured data. Analyze the data line by line, using the provided ground truth as a reference.
      <GROUND_TRUTH_REFERENCE>
      {ground_truth_json}
      </GROUND_TRUTH_REFERENCE>
      The ground truth reference JSON contains the fields we are interested in extracting from the document/image. Use the ground truth to optimize field extraction. Match field names, data types, and groupings from the reference.
      The image may contain multiple pages; process all pages.
      Extract all field names, including those without values.
      Do not change the group names and field names from the ground truth in the extracted data JSON.
      Add a field_description field for every field, containing an instruction to the LLM on how to extract the field's data from the image/document. Add a data_type field for every field.
      Add two fields: document_class and document_description.
      For document_class, generate a short name based on the document content, such as W4, I-9, or Paystub.
      For document_description, generate a description of the document in fewer than 50 words.
      If a group repeats and follows a table format, set its attributeType to "list".
      Do not extract the values.
      Format the extracted groups and fields using the below JSON format:
    model_id: us.amazon.nova-pro-v1:0
    system_prompt: >-
      You are an expert in processing forms and extracting data from images and
      documents. Use the provided ground truth data as a reference to optimize
      field extraction and ensure consistency with the expected document
      structure and field definitions.
    max_tokens: '10000'
  without_ground_truth:
    top_p: '0.1'
    temperature: '1.0'
    user_prompt: >-
      This image contains forms data. Analyze the form line by line.
      The image may contain multiple pages; process all pages.
      A form may contain multiple name-value pairs on one line.
      Extract all the names in the form, including name-value pairs that have no value.
      Organize them into groups, extracting field_name, data_type, and field_description.
      field_name should be fewer than 60 characters and should not contain spaces; use '-' instead of a space.
      field_description is a brief description of the field and its location in the form, such as the box number or line number and the section of the form.
      field_name should be unique within the group.
      Add two fields: document_class and document_description.
      For document_class, generate a short name based on the document content, such as W4, I-9, or Paystub.
      For document_description, generate a description of the document in fewer than 50 words.

      Group the fields based on the section of the form they appear in. A group should have attributeType "group".
      If a group repeats and follows a table format, set its attributeType to "list".
      Do not extract the values.
      Return the extracted data in JSON format.
      Format the extracted groups and fields using the below JSON format:
    model_id: us.amazon.nova-pro-v1:0
    system_prompt: >-
      You are an expert in processing forms and extracting data from images and
      documents. Analyze forms line by line to identify field names, data types,
      and organizational structure. Focus on creating comprehensive blueprints
      for document processing without extracting actual values.
    max_tokens: '10000'
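As a rough illustration of how the `{ground_truth_json}` placeholder in the `with_ground_truth` user prompt might be filled at runtime (the `render_prompt` helper below is hypothetical, not part of the accelerator):

```python
import json

def render_prompt(template: str, ground_truth: dict) -> str:
    """Substitute the ground-truth JSON into the discovery prompt template.

    The placeholder name matches the {ground_truth_json} token in the config;
    everything else here is an assumption for illustration.
    """
    return template.replace("{ground_truth_json}",
                            json.dumps(ground_truth, indent=2))
```

The rendered string would then be sent as the user message alongside the document image.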

config_library/pattern-2/bank-statement-sample/config.yaml

Lines changed: 104 additions & 0 deletions

@@ -68,6 +68,7 @@ classes:
      description: List of all transactions in the statement period
      attributeType: list
classification:
+  maxPagesForClassification: "ALL"
  image:
    target_height: ''
    target_width: ''

@@ -371,6 +372,7 @@ summarization:

assessment:
  enabled: true
+  validation_enabled: false
  image:
    target_height: ''
    target_width: ''

@@ -691,3 +693,105 @@ pricing:
      price: '1.5E-6'
    - name: cacheWriteInputTokens
      price: '1.875E-5'

The following `discovery` section is appended:

discovery:
  output_format:
    sample_json: |-
      {
        "document_class" : "Form-1040",
        "document_description" : "Brief summary of the document",
        "groups" : [
          {
            "name" : "PersonalInformation",
            "description" : "Personal information of the taxpayer",
            "attributeType" : "group",
            "groupAttributes" : [
              {
                "name": "FirstName",
                "dataType" : "string",
                "description" : "First name of the taxpayer"
              },
              {
                "name": "Age",
                "dataType" : "number",
                "description" : "Age of the taxpayer"
              }
            ]
          },
          {
            "name" : "Dependents",
            "description" : "Dependents of the taxpayer",
            "attributeType" : "list",
            "listItemTemplate": {
              "itemAttributes" : [
                {
                  "name": "FirstName",
                  "dataType" : "string",
                  "description" : "Dependent's first name"
                },
                {
                  "name": "Age",
                  "dataType" : "number",
                  "description" : "Dependent's age"
                }
              ]
            }
          }
        ]
      }
  with_ground_truth:
    top_p: '0.1'
    temperature: '1.0'
    user_prompt: >-
      This image contains unstructured data. Analyze the data line by line, using the provided ground truth as a reference.
      <GROUND_TRUTH_REFERENCE>
      {ground_truth_json}
      </GROUND_TRUTH_REFERENCE>
      The ground truth reference JSON contains the fields we are interested in extracting from the document/image. Use the ground truth to optimize field extraction. Match field names, data types, and groupings from the reference.
      The image may contain multiple pages; process all pages.
      Extract all field names, including those without values.
      Do not change the group names and field names from the ground truth in the extracted data JSON.
      Add a field_description field for every field, containing an instruction to the LLM on how to extract the field's data from the image/document. Add a data_type field for every field.
      Add two fields: document_class and document_description.
      For document_class, generate a short name based on the document content, such as W4, I-9, or Paystub.
      For document_description, generate a description of the document in fewer than 50 words.
      If a group repeats and follows a table format, set its attributeType to "list".
      Do not extract the values.
      Format the extracted groups and fields using the below JSON format:
    model_id: us.amazon.nova-pro-v1:0
    system_prompt: >-
      You are an expert in processing forms and extracting data from images and
      documents. Use the provided ground truth data as a reference to optimize
      field extraction and ensure consistency with the expected document
      structure and field definitions.
    max_tokens: '10000'
  without_ground_truth:
    top_p: '0.1'
    temperature: '1.0'
    user_prompt: >-
      This image contains forms data. Analyze the form line by line.
      The image may contain multiple pages; process all pages.
      A form may contain multiple name-value pairs on one line.
      Extract all the names in the form, including name-value pairs that have no value.
      Organize them into groups, extracting field_name, data_type, and field_description.
      field_name should be fewer than 60 characters and should not contain spaces; use '-' instead of a space.
      field_description is a brief description of the field and its location in the form, such as the box number or line number and the section of the form.
      field_name should be unique within the group.
      Add two fields: document_class and document_description.
      For document_class, generate a short name based on the document content, such as W4, I-9, or Paystub.
      For document_description, generate a description of the document in fewer than 50 words.

      Group the fields based on the section of the form they appear in. A group should have attributeType "group".
      If a group repeats and follows a table format, set its attributeType to "list".
      Do not extract the values.
      Return the extracted data in JSON format.
      Format the extracted groups and fields using the below JSON format:
    model_id: us.amazon.nova-pro-v1:0
    system_prompt: >-
      You are an expert in processing forms and extracting data from images and
      documents. Analyze forms line by line to identify field names, data types,
      and organizational structure. Focus on creating comprehensive blueprints
      for document processing without extracting actual values.
    max_tokens: '10000'

config_library/pattern-2/criteria-validation/config.yaml

Lines changed: 105 additions & 0 deletions

@@ -2,6 +2,9 @@
# SPDX-License-Identifier: MIT-0

notes: Criteria validation configuration for healthcare/insurance prior authorization
+assessment:
+  enabled: true
+  validation_enabled: false
criteria_validation:
  model: us.anthropic.claude-3-5-sonnet-20240620-v1:0
  temperature: 0.0

@@ -209,3 +212,105 @@ pricing:
      price: 0.0000032
    - name: cacheReadInputTokens
      price: 0.0000002

The following `discovery` section is appended:

discovery:
  output_format:
    sample_json: |-
      {
        "document_class" : "Form-1040",
        "document_description" : "Brief summary of the document",
        "groups" : [
          {
            "name" : "PersonalInformation",
            "description" : "Personal information of the taxpayer",
            "attributeType" : "group",
            "groupAttributes" : [
              {
                "name": "FirstName",
                "dataType" : "string",
                "description" : "First name of the taxpayer"
              },
              {
                "name": "Age",
                "dataType" : "number",
                "description" : "Age of the taxpayer"
              }
            ]
          },
          {
            "name" : "Dependents",
            "description" : "Dependents of the taxpayer",
            "attributeType" : "list",
            "listItemTemplate": {
              "itemAttributes" : [
                {
                  "name": "FirstName",
                  "dataType" : "string",
                  "description" : "Dependent's first name"
                },
                {
                  "name": "Age",
                  "dataType" : "number",
                  "description" : "Dependent's age"
                }
              ]
            }
          }
        ]
      }
  with_ground_truth:
    top_p: '0.1'
    temperature: '1.0'
    user_prompt: >-
      This image contains unstructured data. Analyze the data line by line, using the provided ground truth as a reference.
      <GROUND_TRUTH_REFERENCE>
      {ground_truth_json}
      </GROUND_TRUTH_REFERENCE>
      The ground truth reference JSON contains the fields we are interested in extracting from the document/image. Use the ground truth to optimize field extraction. Match field names, data types, and groupings from the reference.
      The image may contain multiple pages; process all pages.
      Extract all field names, including those without values.
      Do not change the group names and field names from the ground truth in the extracted data JSON.
      Add a field_description field for every field, containing an instruction to the LLM on how to extract the field's data from the image/document. Add a data_type field for every field.
      Add two fields: document_class and document_description.
      For document_class, generate a short name based on the document content, such as W4, I-9, or Paystub.
      For document_description, generate a description of the document in fewer than 50 words.
      If a group repeats and follows a table format, set its attributeType to "list".
      Do not extract the values.
      Format the extracted groups and fields using the below JSON format:
    model_id: us.amazon.nova-pro-v1:0
    system_prompt: >-
      You are an expert in processing forms and extracting data from images and
      documents. Use the provided ground truth data as a reference to optimize
      field extraction and ensure consistency with the expected document
      structure and field definitions.
    max_tokens: '10000'
  without_ground_truth:
    top_p: '0.1'
    temperature: '1.0'
    user_prompt: >-
      This image contains forms data. Analyze the form line by line.
      The image may contain multiple pages; process all pages.
      A form may contain multiple name-value pairs on one line.
      Extract all the names in the form, including name-value pairs that have no value.
      Organize them into groups, extracting field_name, data_type, and field_description.
      field_name should be fewer than 60 characters and should not contain spaces; use '-' instead of a space.
      field_description is a brief description of the field and its location in the form, such as the box number or line number and the section of the form.
      field_name should be unique within the group.
      Add two fields: document_class and document_description.
      For document_class, generate a short name based on the document content, such as W4, I-9, or Paystub.
      For document_description, generate a description of the document in fewer than 50 words.

      Group the fields based on the section of the form they appear in. A group should have attributeType "group".
      If a group repeats and follows a table format, set its attributeType to "list".
      Do not extract the values.
      Return the extracted data in JSON format.
      Format the extracted groups and fields using the below JSON format:
    model_id: us.amazon.nova-pro-v1:0
    system_prompt: >-
      You are an expert in processing forms and extracting data from images and
      documents. Analyze forms line by line to identify field names, data types,
      and organizational structure. Focus on creating comprehensive blueprints
      for document processing without extracting actual values.
    max_tokens: '10000'

0 commit comments