From 879f3359befae10e2cf0d707904bcc551403fb5a Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 28 Nov 2025 18:49:20 +0100 Subject: [PATCH 01/39] chore: fix missing substitution for custom_prompt_lambda_arn --- patterns/pattern-2/template.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/patterns/pattern-2/template.yaml b/patterns/pattern-2/template.yaml index 6605c8af..2bd9c364 100644 --- a/patterns/pattern-2/template.yaml +++ b/patterns/pattern-2/template.yaml @@ -1026,7 +1026,7 @@ Resources: order: 7 custom_prompt_lambda_arn: type: string - description: "(Optional) ARN of a Lambda function to generate custom extraction prompts. Function name must start with 'GENAIIDP-'. If not provided, default prompts will be used. The Lambda function receives the complete config, prompt placeholders, default task prompt content, and serialized document, and returns custom system_prompt and task_prompt_content. Example: arn:${AWS::Partition}:lambda:us-east-1:123456789012:function:GENAIIDP-my-extractor" + description: !Sub "(Optional) ARN of a Lambda function to generate custom extraction prompts. Function name must start with 'GENAIIDP-'. If not provided, default prompts will be used. The Lambda function receives the complete config, prompt placeholders, default task prompt content, and serialized document, and returns custom system_prompt and task_prompt_content. Example: arn:${AWS::Partition}:lambda:us-east-1:123456789012:function:GENAIIDP-my-extractor" order: 8 assessment: order: 5 From 335f87b004a56f54c67c9d59e36526b012a65fba Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 28 Nov 2025 21:29:00 +0100 Subject: [PATCH 02/39] feat: dynamic-few shot Lambda using S3 Vectors --- .../GENAIIDP-dynamic-few-shot.py | 257 +++++++++++++ .../dynamic-few-shot-lambda/README.md | 364 ++++++++++++++++++ .../dynamic-few-shot-lambda/requirements.txt | 1 + .../dynamic-few-shot-lambda/samconfig.toml | 11 + .../dynamic-few-shot-lambda/template.yml | 204 ++++++++++ 5 files changed, 837 insertions(+) create mode 100644 notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py create mode 100644 notebooks/examples/dynamic-few-shot-lambda/README.md create mode 100644 notebooks/examples/dynamic-few-shot-lambda/requirements.txt create mode 100644 notebooks/examples/dynamic-few-shot-lambda/samconfig.toml create mode 100644 notebooks/examples/dynamic-few-shot-lambda/template.yml diff --git a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py new file mode 100644 index 00000000..b2c6272d --- /dev/null +++ b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py @@ -0,0 +1,257 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +""" +Lambda function to provide examples with ground truth data based on S3 Vectors lookup. 
+ +Key Features Demonstrated: +- Dynamically retrieve similar examples based on document content using vector similarity search +- Provide few-shot examples to improve extraction accuracy through example-based prompting +- Leverage S3 Vectors for efficient similarity search across large example datasets +- Integrate multimodal embeddings using Amazon Nova models for image-based similarity +- Customize example selection based on document characteristics and business rules +""" + +import json +import logging +import base64 +import boto3 +import os + +from idp_common import bedrock, s3 + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# Parse environment variables with error handling +try: + S3VECTOR_BUCKET = os.environ['S3VECTOR_BUCKET'] + S3VECTOR_INDEX = os.environ['S3VECTOR_INDEX'] + S3VECTOR_DIMENSIONS = int(os.environ['S3VECTOR_DIMENSIONS']) + MODEL_ID = os.environ['MODEL_ID'] + TOP_K = int(os.environ['TOP_K']) +except (KeyError, ValueError, IndexError) as e: + logger.error(f"Failed to parse environment variables: {e}") + raise + +# Initialize clients +s3vectors = boto3.client('s3vectors') +bedrock_client = bedrock.BedrockClient() + +def lambda_handler(event, context): + """ + Process a document to find similar examples using S3 Vectors similarity search. + + Input event: + { + "class_label": "", + "document_texts": ["", "", ...], + "image_content": ["", "", ...] + } + + Return format: + [ + { + "attributes_prompt": "expected attributes are: ...", + "class_prompt": "This is an example of the class 'invoice'", + "distance": 0.892344521145, + "image_content": ["", "", ...] + } + ] + """ + + try: + logger.info("=== DYNAMIC FEW-SHOT LAMBDA INVOKED ===") + logger.debug(f"Complete input event: {json.dumps(event, indent=2)}") + + # Validate input + class_label = event.get("class_label") + document_texts = event.get("document_texts", []) + image_content = event.get("image_content", []) + + logger.info(f"=== INPUT VALUES ===") + logger.info(f"Class label: {class_label if class_label else 'Not specified'}") + logger.info(f"Document texts: {len(document_texts)}") + logger.info(f"Image content: {len(image_content)}") + + # Decode input data + image_data = _decode_images(image_content) + + # Find similar items using S3 vectors lookup from image similarity + result = _s3vectors_find_similar_items(image_data) + + # Log complete output structure + logger.info(f"=== OUTPUT ANALYSIS ===") + logger.debug(f"Complete result: {json.dumps(result, indent=2)}") + logger.info(f"Output items: {len(result)}") + + logger.info("=== DYNAMIC FEW-SHOT LAMBDA COMPLETED ===") + return result + + except Exception as e: + logger.error(f"=== DYNAMIC FEW-SHOT LAMBDA ERROR ===") + logger.error(f"Error type: {type(e).__name__}") + logger.error(f"Error message: {str(e)}") + logger.error(f"Input event keys: {list(event.keys()) if 'event' in locals() else 'Unknown'}") + # In demo, we'll fail gracefully with detailed error info + raise Exception(f"Dynamic few-shot Lambda failed: {str(e)}") + +def _decode_images(image_content): + """Base64 decode image content to bytes""" + result = [] + for image_base64 in image_content: + image_data = base64.b64decode(image_base64) + result.append(image_data) + return result + +def _encode_images(image_content): + """Base64 encode image content to JSON-serializable string""" + result = [] + for image_bytes in image_content: + image_base64 = base64.b64encode(image_bytes).decode("utf-8") + result.append(image_base64) + return result + +def 
_s3vectors_find_similar_items(image_data): + """Find similar items for input""" + + # find similar items based on image similarity only + similar_items = {} + for page_image in image_data: + result = _s3vectors_find_similar_items_from_image(image_data) + _merge_examples(similar_items, result) + + # create result set + result = [] + for key, example in similar_items.items(): + metadata = example.get("metadata", {}) + attributes_prompt = metadata.get("attributesPrompt") + + # Only process this example if it has a non-empty attributesPrompt + if not attributes_prompt or not attributes_prompt.strip(): + logger.info( + f"Skipping example with empty attributesPrompt: {key}" + ) + continue + + attributes = _extract_metadata(metadata) + result.append(attributes) + + return result + +def _s3vectors_find_similar_items_from_image(page_image): + """Search for similar items using image query""" + embedding = bedrock_client.generate_embedding( + image_source=page_image, + model_id=MODEL_ID, + dimensions=S3VECTOR_DIMENSIONS, + ) + response = s3vectors.query_vectors( + vectorBucketName=S3VECTOR_BUCKET, + indexName=S3VECTOR_INDEX, + queryVector={"float32": embedding}, + topK=TOP_K, + returnDistance=True, + returnMetadata=True + ) + return response["vectors"] + +def _merge_examples(examples, new_examples): + """ + Merge in-place new examples into the result list, avoiding duplicates. + + Args: + examples: Dict of existing examples + new_examples: List of new examples to be merged + """ + for new_example in new_examples: + key = new_example["key"] + new_distance = new_example.get("distance", 1.0) + + # update example + if combined_examples.get(key): + existing_distance = combined_examples[key].get("distance", 1.0) + examples[key]["distance"] = min(new_distance, existing_distance) + examples[key]["metadata"] = new_example.get("metadata") + # insert example + else: + examples[key] = { + "distance": new_distance, + "metadata": new_example.get("metadata") + } + +def _extract_metadata(metadata, distance): + """Create result object from S3 vectors metadata""" + # Result object attributes + attributes = { + "attributes_prompt": metadata.get("attributesPrompt"), + "class_prompt": metadata.get("classPrompt"), + "distance": distance, + } + + image_path = metadata.get("imagePath") + if image_path: + image_data = _get_image_data_from_s3_path(image_path) + encoded_images = _encode_images(image_data) + attributes["image_content"] = encoded_images + + return attributes + +def _get_image_data_from_s3_path(image_path): + """ + Load images from image path + + Args: + image_path: Path to image file, directory, or S3 prefix + + Returns: + List of images (bytes) + """ + # Get list of image files from the path (supports directories/prefixes) + image_files = _get_image_files_from_s3_path(image_path) + image_content = [] + + # Process each image file + for image_file_path in image_files: + try: + # Load image content + if image_file_path.startswith("s3://"): + # Direct S3 URI + image_bytes = s3.get_binary_content(image_file_path) + else: + raise ValueError( + f"Invalid file path {image_path} - expecting S3 path" + ) + + image_content.append(image_bytes) + except Exception as e: + logger.warning(f"Failed to load image {image_file_path}: {e}") + continue + + return image_content + +def _get_image_files_from_s3_path(image_path): + """ + Get list of image files from an S3 path. 

    Args:
        image_path: Path to image file, directory, or S3 prefix

    Returns:
        List of image file paths/URIs sorted by filename
    """
    # Handle S3 URIs
    if not image_path.startswith("s3://"):
        raise ValueError(
            f"Invalid file path {image_path} - expecting S3 URI"
        )

    # Check if it's a direct file or a prefix
    if image_path.endswith(
        (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp")
    ):
        # Direct S3 file
        return [image_path]
    else:
        # S3 prefix - list all images
        return s3.list_images_from_path(image_path)
diff --git a/notebooks/examples/dynamic-few-shot-lambda/README.md b/notebooks/examples/dynamic-few-shot-lambda/README.md
new file mode 100644
index 00000000..a6f4219e
--- /dev/null
+++ b/notebooks/examples/dynamic-few-shot-lambda/README.md
@@ -0,0 +1,364 @@
# Dynamic Few-Shot Prompting - Complete Guide

This directory contains the **complete implementation and demonstration** of the dynamic few-shot prompting feature for the GenAI IDP Accelerator. This feature enables users to dynamically retrieve few-shot examples using S3 Vectors similarity search to improve extraction accuracy for Pattern 2.

## 🎯 Overview

The dynamic few-shot prompting feature allows you to:

- **Dynamically retrieve similar examples** based on document content using vector similarity search
- **Provide few-shot examples** to improve extraction accuracy through example-based prompting
- **Leverage S3 Vectors** for efficient similarity search across large example datasets
- **Integrate multimodal embeddings** using Amazon Nova models for image-based similarity
- **Customize example selection** based on document characteristics and business rules

## 📁 Files in This Directory

- **`GENAIIDP-dynamic-few-shot.py`** - Dynamic few-shot Lambda function with S3 Vectors lookup
- **`template.yml`** - CloudFormation SAM template to deploy the complete stack
- **`requirements.txt`** - Python dependencies for the Lambda function
- **`samconfig.toml`** - Default SAM deployment configuration for the stack
- **`README.md`** - This comprehensive documentation and guide

## 🏗️ Architecture

```mermaid
flowchart TD
    A[Document Processing] --> B{Dynamic few-shot configured?}
    B -->|No| C[Use Default Extraction]
    B -->|Yes| D[Invoke Dynamic few-shot Lambda]

    subgraph Lambda
        D --> E[Receive Document Images]
        E --> F[Generate Embeddings with Nova]
        F --> G[Query S3 Vectors Index]
        G --> H[Retrieve Similar Examples]
        H --> I[Load Example Images from S3]
        I --> J[Format Examples for Bedrock]
    end

    J --> K[Use Examples in Extraction Prompt]
    C --> L[Continue with Standard Extraction]
    K --> L

    subgraph Input
        M[Document Class]
        N[Document Text]
        O[Document Images]
    end

    subgraph Output
        P[Example Attributes Prompts]
        Q[Example Images]
        R[Similarity Distances]
    end

    D -.-> M
    D -.-> N
    D -.-> O

    J -.-> P
    J -.-> Q
    J -.-> R
```

## Quick Start

### Step 1: Deploy the Dynamic Few-Shot Stack

```bash
# Navigate to the dynamic-few-shot-lambda directory
cd notebooks/examples/dynamic-few-shot-lambda

# Deploy using AWS SAM
sam deploy --guided
```

### Step 2: Get the Lambda ARN

After deployment, get the ARN from CloudFormation outputs:

```bash
aws cloudformation describe-stacks \
  --stack-name GENAIIDP-dynamic-few-shot-stack \
  --query 'Stacks[0].Outputs[?OutputKey==`DynamicFewShotFunctionArn`].OutputValue' \
  --output text
```

### Step 3: Populate the Examples Dataset

Use the [fewshot_dataset_import.ipynb](../../misc/fewshot_dataset_import.ipynb) notebook to import a dataset into S3 Vectors, or manually upload your example documents and metadata to the S3 bucket and vector index created by the stack.

### Step 4: Configure IDP to Use Dynamic Few-Shot Prompting

Add the Lambda ARN to your IDP extraction configuration:

```yaml
extraction:
  dynamic_few_shot_lambda_arn: "arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot"
```

## Lambda Interface

### Input Payload Structure
```json
{
  "class_label": "invoice",
  "document_texts": [
    "Invoice text or markdown from page 1...",
    "Invoice text or markdown from page 2..."
  ],
  "image_content": [
    "base64_encoded_image_1",
    "base64_encoded_image_2"
  ]
}
```

### Output Payload Structure
```json
[
  {
    "attributes_prompt": "Expected attributes are: invoice_number [Unique identifier], invoice_date [Invoice date], total_amount [Total amount]...",
    "class_prompt": "This is an example of the class 'invoice'",
    "distance": 0.892344521145,
    "image_content": ["<base64_encoded_image>", "<base64_encoded_image>", ...]
  }
]
```

## Core Functionality

### 1. Vector Similarity Search

The Lambda uses Amazon Nova multimodal embeddings to find similar examples:

```python
# Generate embedding from document image
embedding = bedrock_client.generate_embedding(
    image_source=image_data,
    model_id=MODEL_ID,
    dimensions=S3VECTOR_DIMENSIONS,
)

# Query S3 Vectors for similar examples
response = s3vectors.query_vectors(
    vectorBucketName=S3VECTOR_BUCKET,
    indexName=S3VECTOR_INDEX,
    queryVector={"float32": embedding},
    topK=TOP_K,
    returnDistance=True,
    returnMetadata=True
)
```

### 2. Example Merging and Deduplication

Multiple document images are processed and results are merged to avoid duplicates:

```python
def merge_examples(combined_examples, new_examples):
    """Merge examples, keeping the best similarity score for duplicates"""
    for new_example in new_examples:
        key = new_example["key"]
        if combined_examples.get(key):
            # Keep the better (lower) distance score
            combined_examples[key]["distance"] = min(
                new_example.get("distance"), 
                combined_examples[key]["distance"]
            )
```

### 3. Example Image Loading

The Lambda loads example images from S3 paths stored in vector metadata:

```python
def get_image_files_from_s3_path(image_path: str) -> List[str]:
    """Get list of image files from S3 path or prefix"""
    if image_path.endswith((".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp")):
        return [image_path]  # Direct file
    else:
        return s3.list_images_from_path(image_path)  # Directory/prefix
```

## Configuration

### Environment Variables

The Lambda function uses these environment variables (set by the CloudFormation template):

- `S3VECTOR_BUCKET` - Name of the S3 Vectors bucket
- `S3VECTOR_INDEX` - Name of the S3 Vectors index
- `S3VECTOR_DIMENSIONS` - Embedding dimensions (e.g. `3072` for Nova Multimodal Embedding model)
- `MODEL_ID` - Bedrock model ID for embeddings (e.g. `amazon.nova-2-multimodal-embeddings-v1:0`)
- `TOP_K` - Number of similar examples to retrieve

### S3 Vectors Configuration

The stack creates:
- **Vector Bucket**: Encrypted S3 bucket for vector storage
- **Vector Index**: Cosine similarity index with 3072 dimensions
- **Metadata Configuration**: Stores `classPrompt`, `attributesPrompt`, and `imagePath` as non-filterable metadata keys

## Monitoring and Troubleshooting

### CloudWatch Logs

Monitor the Lambda function logs:
- `/aws/lambda/GENAIIDP-dynamic-few-shot` - Dynamic few-shot Lambda logs

### Key Log Messages

**Successful Operation:**
```
=== DYNAMIC FEW-SHOT LAMBDA INVOKED ===
Class label: invoice
Document texts: 2
Image content: 2
Output items: 2
=== DYNAMIC FEW-SHOT LAMBDA COMPLETED ===
```

**Error Conditions:**
```
Failed to parse environment variables: 'S3VECTOR_BUCKET'
Skipping example with empty attributesPrompt: example-001
Failed to load image s3://bucket/path/page-1.jpg: error
=== DYNAMIC FEW-SHOT LAMBDA ERROR ===
```

### Performance Monitoring

Key metrics to monitor:
- **Lambda Duration**: Time to retrieve and process examples
- **S3 Vectors Query Time**: Vector similarity search performance
- **Example Count**: Number of examples returned per request
- **Error Rate**: Failed example retrievals

## Example Dataset Structure

### Vector Metadata Format

Each vector in the S3 Vectors index should have metadata:

```json
{
  "classLabel": "invoice",
  "classPrompt": "This is an example of the class 'invoice'",
  "attributesPrompt": "Expected attributes are: invoice_number [Unique identifier], invoice_date [Invoice date], total_amount [Total amount]...",
  "imagePath": "s3://examples-bucket/invoices/example-001/"
}
```

### Image Storage Structure

Example images should be stored in S3 with paths referenced in metadata:

```
s3://examples-bucket/
├── invoices/
│   ├── example-001/
│   │   ├── page-1.jpg
│   │   └── page-2.jpg
│   └── example-002/
│       └── invoice.png
└── receipts/
    ├── example-003/
    │   └── receipt.jpg
    └── example-004/
        └── receipt.png
```

## Production Considerations

### 1. Example Dataset Management

- **Quality Control**: Ensure high-quality, representative examples
- **Regular Updates**: Keep examples current with document variations
- **Metadata Consistency**: Maintain consistent attribute descriptions
- **Image Optimization**: Use appropriate image formats and sizes

### 2. Performance Optimization

```python
# Cache frequently accessed examples
# Optimize vector dimensions for your use case
# Use appropriate TOP_K values (typically 2-5)
# Consider batch processing for multiple documents
```

### 3. Security Considerations

- **Access Control**: Restrict access to example datasets
- **Data Privacy**: Ensure examples don't contain sensitive information
- **Encryption**: Use appropriate encryption for stored examples
- **Audit Logging**: Log example usage for compliance

### 4. Cost Optimization

- **Vector Index Size**: Monitor storage costs for large example sets
- **Embedding Generation**: Optimize frequency of embedding updates
- **Lambda Memory**: Right-size memory allocation based on usage
- **S3 Storage Classes**: Use appropriate storage classes for examples

## Deployment Options

### Option 1: AWS SAM (Recommended)
```bash
sam build
sam deploy --guided
```

### Option 2: AWS CLI
```bash
# Package and deploy
aws cloudformation package \
  --template-file template.yml \
  --s3-bucket your-deployment-bucket \
  --output-template-file packaged-template.yml

aws cloudformation deploy \
  --template-file packaged-template.yml \
  --stack-name GENAIIDP-dynamic-few-shot-stack \
  --capabilities CAPABILITY_IAM
```

## Cleanup

To remove the dynamic few-shot resources:

```bash
# Delete the CloudFormation stack
aws cloudformation delete-stack --stack-name GENAIIDP-dynamic-few-shot-stack

# Note: S3 buckets with retention policy will be retained
```

## Integration with IDP

### Configuration in IDP Stack

Add the dynamic few-shot Lambda ARN to your IDP configuration:

```yaml
# In your IDP stack parameters or configuration
extraction:
  dynamic_few_shot_lambda_arn: "arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot"
```

### Expected Behavior

When configured:
1. IDP processes document and extracts images/text
2. Dynamic few-shot Lambda is invoked with document data
3. Lambda returns similar examples with prompts and images
4. IDP includes examples in extraction prompt to Bedrock
5. Bedrock uses examples to improve extraction accuracy

## Next Steps

After deploying the dynamic few-shot stack:

1. **Populate example dataset** with representative documents
2. **Test similarity search** with sample documents
3. **Monitor performance** and adjust TOP_K as needed
4. **Integrate with IDP** using the Lambda ARN
5. **Evaluate accuracy improvements** with few-shot examples

The dynamic few-shot feature enables powerful few-shot learning while leveraging efficient vector similarity search for dynamic example selection.
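
## Appendix: Smoke-Testing the Deployed Lambda

To sanity-check a deployment end to end, you can invoke the function directly with the payload shape documented above. The snippet below is a minimal sketch, not part of the stack: the local image filename is hypothetical, and it assumes the default `GENAIIDP-dynamic-few-shot` function name from `samconfig.toml` and credentials that are allowed to invoke it.

```python
import base64
import json

import boto3

FUNCTION_NAME = "GENAIIDP-dynamic-few-shot"  # default LambdaFunctionName parameter

# Hypothetical local test image - substitute any document page you have on disk
with open("sample-invoice.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "class_label": "invoice",
    "image_content": [image_b64],
}

lambda_client = boto3.client("lambda")
response = lambda_client.invoke(
    FunctionName=FUNCTION_NAME,
    Payload=json.dumps(payload).encode("utf-8"),
)

# The function returns a JSON list of similar examples
examples = json.loads(response["Payload"].read())
for example in examples:
    print(example["distance"], example["class_prompt"])
```

A run against a freshly imported dataset should print one line per retrieved example, with lower distances indicating closer matches.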
\ No newline at end of file diff --git a/notebooks/examples/dynamic-few-shot-lambda/requirements.txt b/notebooks/examples/dynamic-few-shot-lambda/requirements.txt new file mode 100644 index 00000000..2048c02c --- /dev/null +++ b/notebooks/examples/dynamic-few-shot-lambda/requirements.txt @@ -0,0 +1 @@ +../../../lib/idp_common_pkg[extraction,docs_service] # extraction module and document service with dependencies diff --git a/notebooks/examples/dynamic-few-shot-lambda/samconfig.toml b/notebooks/examples/dynamic-few-shot-lambda/samconfig.toml new file mode 100644 index 00000000..e25430e5 --- /dev/null +++ b/notebooks/examples/dynamic-few-shot-lambda/samconfig.toml @@ -0,0 +1,11 @@ +version = 0.1 + +[default.deploy.parameters] +stack_name = "GENAIIDP-dynamic-few-shot-stack" +resolve_s3 = true +s3_prefix = "GENAIIDP-dynamic-few-shot-stack" +region = "us-east-1" +capabilities = "CAPABILITY_IAM" +disable_rollback = true +parameter_overrides = "PermissionsBoundaryArn=\"\" VectorBucketName=\"genaiidp-dynamic-few-shot\" VectorIndexName=\"documents\" VectorDimensions=\"3072\" ModelId=\"amazon.nova-2-multimodal-embeddings-v1:0\" TopK=\"2\" LambdaFunctionName=\"GENAIIDP-dynamic-few-shot\"" +image_repositories = [] diff --git a/notebooks/examples/dynamic-few-shot-lambda/template.yml b/notebooks/examples/dynamic-few-shot-lambda/template.yml new file mode 100644 index 00000000..927c9a65 --- /dev/null +++ b/notebooks/examples/dynamic-few-shot-lambda/template.yml @@ -0,0 +1,204 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +AWSTemplateFormatVersion: '2010-09-09' +Transform: AWS::Serverless-2016-10-31 +Description: Deploy demo Lambda function for GenAI IDP dynamic few-shot prompting + +Parameters: + + PermissionsBoundaryArn: + Type: String + Default: "" + Description: >- + (Optional) ARN of an existing IAM Permissions Boundary policy to attach to the Lambda execution role. + Leave blank if no Permissions Boundary is required. 
AllowedPattern: "^(|arn:aws[a-z-]*:iam::[0-9]{12}:policy/.+)$"
    ConstraintDescription: Must be empty or a valid IAM policy ARN

  VectorBucketName:
    Type: String
    Default: "genaiidp-dynamic-few-shot"

  VectorIndexName:
    Type: String
    Default: "documents"

  VectorDimensions:
    Type: Number
    Default: 3072

  ModelId:
    Type: String
    Default: "amazon.nova-2-multimodal-embeddings-v1:0"

  TopK:
    Type: Number
    Default: 2

  LambdaFunctionName:
    Type: String
    Default: "GENAIIDP-dynamic-few-shot"

Conditions:
  HasPermissionsBoundary: !Not [!Equals [!Ref PermissionsBoundaryArn, ""]]

Resources:

  DynamicFewShotFunction:
    Type: AWS::Serverless::Function
    Metadata:
      cfn_nag:
        rules_to_suppress:
          - id: W89
            reason: "Demo function - does not require VPC access"
          - id: W92
            reason: "Demo function - does not require reserved concurrency as it scales based on demand"
          - id: W58
            reason: "Demo function - DLQ not required"
      # checkov:skip=CKV_AWS_116: "Demo function - DLQ not required"
      # checkov:skip=CKV_AWS_117: "Function does not require VPC access as it only interacts with AWS services via APIs"
      # checkov:skip=CKV_AWS_115: "Function does not require reserved concurrency as it scales based on demand"
      # checkov:skip=CKV_AWS_173: "Environment variables do not contain sensitive data - only configuration values like feature flags and non-sensitive settings"
    Properties:
      FunctionName: !Ref LambdaFunctionName
      PermissionsBoundary: !If [HasPermissionsBoundary, !Ref PermissionsBoundaryArn, !Ref AWS::NoValue]
      CodeUri: ./
      Handler: GENAIIDP-dynamic-few-shot.lambda_handler
      Runtime: python3.12
      Architectures:
        - arm64
      Timeout: 300
      MemorySize: 512
      Description: Demo Lambda function for GenAI IDP dynamic few-shot prompting
      Environment:
        Variables:
          LOG_LEVEL: INFO
          S3VECTOR_BUCKET: !Ref VectorBucketName
          S3VECTOR_INDEX: !Ref VectorIndexName
          S3VECTOR_DIMENSIONS: !Ref VectorDimensions
          MODEL_ID: !Ref ModelId
          TOP_K: !Ref TopK
      LoggingConfig:
        LogGroup: !Ref DynamicFewShotLogGroup
      # Scoped permissions - basic execution/logging, dataset bucket reads, Bedrock invocation, and S3 Vectors queries
      Policies:
        - AWSLambdaBasicExecutionRole
        - S3ReadPolicy:
            BucketName: !Ref DynamicFewShotDatasetBucket
        - Statement:
            - Effect: Allow
              Action: cloudwatch:PutMetricData
              Resource: "*"
            - Effect: Allow
              Action:
                - bedrock:InvokeModel
                - bedrock:InvokeModelWithResponseStream
              Resource:
                - !Sub "arn:${AWS::Partition}:bedrock:*::foundation-model/*"
                - !Sub "arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:inference-profile/*"
            - Effect: Allow
              Action:
                - s3vectors:GetVectors
                - s3vectors:QueryVectors
              Resource:
                - !Ref DynamicFewShotVectorIndex

  DynamicFewShotLogGroup:
    Type: AWS::Logs::LogGroup
    Metadata:
      cfn_nag:
        rules_to_suppress:
          - id: W84
            reason: "Demo function - KMS CMK not required, but can be added by customer for production use cases"
      # checkov:skip=CKV_AWS_158: "Demo function - KMS CMK not required, but can be added by customer for production use cases"
    Properties:
      LogGroupName: !Sub "/aws/lambda/${LambdaFunctionName}"
      RetentionInDays: 7 # Short retention for demo purposes

  DynamicFewShotVectorBucket:
    Type: AWS::S3Vectors::VectorBucket
    Metadata:
      cfn_nag:
        rules_to_suppress:
          - id: W84
            reason: "Demo function - KMS CMK not required, but can be added by customer for production use cases"
      # checkov:skip=CKV_AWS_158: "Demo function - KMS CMK not required, but can be added by customer for 
production use cases" + Properties: + VectorBucketName: !Ref VectorBucketName + EncryptionConfiguration: + SseType: "AES256" + + DynamicFewShotVectorIndex: + Type: AWS::S3Vectors::Index + Properties: + IndexName: !Ref VectorIndexName + DataType: "float32" + Dimension: !Ref VectorDimensions + DistanceMetric: "cosine" + MetadataConfiguration: + NonFilterableMetadataKeys: + - "classPrompt" + - "attributesPrompt" + - "imagePath" + VectorBucketArn: !Ref DynamicFewShotVectorBucket + + DynamicFewShotDatasetBucket: + Type: AWS::S3::Bucket + DeletionPolicy: RetainExceptOnCreate + Metadata: + cfn_nag: + rules_to_suppress: + - id: W84 + reason: "Demo function - KMS CMK not required, but can be added by customer for production use cases" + # checkov:skip=CKV_AWS_158: "Demo function - KMS CMK not required, but can be added by customer for production use cases" + Properties: + BucketEncryption: + ServerSideEncryptionConfiguration: + - ServerSideEncryptionByDefault: + SSEAlgorithm: "AES256" + PublicAccessBlockConfiguration: + BlockPublicAcls: true + BlockPublicPolicy: true + IgnorePublicAcls: true + RestrictPublicBuckets: true + VersioningConfiguration: + Status: Enabled + +Outputs: + + DynamicFewShotFunctionName: + Description: Name of the demo Lambda function + Value: !Ref DynamicFewShotFunction + + DynamicFewShotFunctionArn: + Description: ARN of the demo Lambda function (use this in your GenAIIDP configuration) + Value: !GetAtt DynamicFewShotFunction.Arn + + DynamicFewShotLogGroup: + Description: CloudWatch Log Group for monitoring demo Lambda execution + Value: !Ref DynamicFewShotLogGroup + + DynamicFewShotVectorBucketArn: + Description: S3 Vectors bucket for dynamic few-shot examples + Value: !Ref DynamicFewShotVectorBucket + + DynamicFewShotVectorIndexArn: + Description: S3 Vectors index for dynamic few-shot examples + Value: !Ref DynamicFewShotVectorIndex + + DynamicFewShotDatasetBucket: + Description: S3 Bucket for example data sets + Value: !Ref DynamicFewShotDatasetBucket + + UsageInstructions: + Description: How to use this Lambda in your IDP configuration + Value: !Sub | + Add this ARN to your extraction config: + extraction: + dynamic_few_shot_lambda_arn: "${DynamicFewShotFunction.Arn}" + + MonitoringLink: + Description: Direct link to CloudWatch logs for this function + Value: !Sub | + https://console.aws.amazon.com/cloudwatch/home?region=${AWS::Region}#logsV2:log-groups/log-group/$252Faws$252Flambda$252F${LambdaFunctionName} \ No newline at end of file From 75eb394632bf5bed11fea7399945f9ee0174ff62 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Thu, 11 Dec 2025 14:43:28 +0100 Subject: [PATCH 03/39] chore: remove whitespace --- .../GENAIIDP-dynamic-few-shot.py | 4 ++-- .../examples/dynamic-few-shot-lambda/README.md | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py index b2c6272d..94d9f746 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py +++ b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py @@ -194,7 +194,7 @@ def _extract_metadata(metadata, distance): image_data = _get_image_data_from_s3_path(image_path) encoded_images = _encode_images(image_data) attributes["image_content"] = encoded_images - + return attributes def _get_image_data_from_s3_path(image_path): @@ -227,7 +227,7 @@ def _get_image_data_from_s3_path(image_path): except Exception as e: 
logger.warning(f"Failed to load image {image_file_path}: {e}")
             continue
-    
+
     return image_content
 
 def _get_image_files_from_s3_path(image_path):
diff --git a/notebooks/examples/dynamic-few-shot-lambda/README.md b/notebooks/examples/dynamic-few-shot-lambda/README.md
index a6f4219e..d30ba33d 100644
--- a/notebooks/examples/dynamic-few-shot-lambda/README.md
+++ b/notebooks/examples/dynamic-few-shot-lambda/README.md
@@ -26,7 +26,7 @@ flowchart TD
     A[Document Processing] --> B{Dynamic few-shot configured?}
     B -->|No| C[Use Default Extraction]
     B -->|Yes| D[Invoke Dynamic few-shot Lambda]
-    
+
     subgraph Lambda
         D --> E[Receive Document Images]
         E --> F[Generate Embeddings with Nova]
@@ -35,27 +35,27 @@ flowchart TD
         H --> I[Load Example Images from S3]
         I --> J[Format Examples for Bedrock]
     end
-    
+
     J --> K[Use Examples in Extraction Prompt]
     C --> L[Continue with Standard Extraction]
     K --> L
-    
+
     subgraph Input
        M[Document Class]
        N[Document Text]
        O[Document Images]
     end
-    
+
     subgraph Output
        P[Example Attributes Prompts]
        Q[Example Images]
        R[Similarity Distances]
     end
-    
+
     D -.-> M
     D -.-> N
     D -.-> O
-    
+
     J -.-> P
     J -.-> Q
     J -.-> R
@@ -163,7 +163,7 @@ def merge_examples(combined_examples, new_examples):
         if combined_examples.get(key):
             # Keep the better (lower) distance score
             combined_examples[key]["distance"] = min(
-                new_example.get("distance"), 
+                new_example.get("distance"),
                 combined_examples[key]["distance"]
             )
From fbe11b2712b9adc9117700f981abf3a5bc412c35 Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Fri, 28 Nov 2025 21:40:32 +0100
Subject: [PATCH 04/39] feat: add support for Amazon Titan Multimodal Embeddings G1 and Amazon Nova Multimodal Embeddings

---
 .../idp_common/bedrock/README.md              | 41 ++++++++++
 .../idp_common/bedrock/client.py              | 78 +++++++++++++++++--
 2 files changed, 112 insertions(+), 7 deletions(-)

diff --git a/lib/idp_common_pkg/idp_common/bedrock/README.md b/lib/idp_common_pkg/idp_common/bedrock/README.md
index 58c5bd64..b0a67cf7 100644
--- a/lib/idp_common_pkg/idp_common/bedrock/README.md
+++ b/lib/idp_common_pkg/idp_common/bedrock/README.md
@@ -73,6 +73,47 @@ embedding = client.generate_embedding(
 # Use embedding for vector search, clustering, etc.
 ```
 
+Amazon Titan Multimodal Embeddings support both text and image at the same time. The resulting embeddings vector averages the text embeddings and image embeddings vectors.
+
+```python
+from idp_common.bedrock.client import BedrockClient
+
+with open("/path/to/document.png", "rb") as image_file:
+    image_data = image_file.read()
+
+client = BedrockClient()
+embedding = client.generate_embedding(
+    text="This document contains information about loan applications.",
+    image_source=image_data,
+    model_id="amazon.titan-embed-image-v1"
+)
+```
+
+The image source can also be an S3 URI:
+
+```python
+from idp_common.bedrock.client import BedrockClient
+
+client = BedrockClient()
+embedding = client.generate_embedding(
+    image_source="s3://bucket/key",
+    model_id="amazon.titan-embed-image-v1"
+)
+```
+
+Amazon Nova Multimodal Embeddings with a 3072-dimension output vector:
+
+```python
+from idp_common.bedrock.client import BedrockClient
+
+client = BedrockClient()
+embedding = client.generate_embedding(
+    image_source="s3://bucket/key",
+    model_id="amazon.nova-2-multimodal-embeddings-v1:0",
+    dimensions=3072
+)
+```
+
 ## Prompt Caching with CachePoint
 
 Prompt caching is a powerful feature in Amazon Bedrock that significantly reduces response latency for workloads with repetitive contexts. The Bedrock client provides built-in support for this via the `<<CACHEPOINT>>` tag. 
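A note on the request shape: for reviewers who want to sanity-check the Nova payload that `generate_embedding` assembles in the client.py change below, the equivalent raw Bedrock call looks roughly like the following sketch. Field names are taken from the diff itself; the image filename and format are placeholders, and model availability depends on your account and region.

```python
import base64
import json

import boto3

bedrock_runtime = boto3.client("bedrock-runtime")

# Placeholder image file - any document page works; the real client
# detects the format via prepare_bedrock_image_attachment
with open("page-1.png", "rb") as f:
    image_base64 = base64.b64encode(f.read()).decode("utf-8")

body = {
    "taskType": "SINGLE_EMBEDDING",
    "singleEmbeddingParams": {
        "embeddingPurpose": "GENERIC_INDEX",
        "embeddingDimension": 3072,
        "image": {
            "format": "png",
            "source": {"bytes": image_base64},
        },
    },
}

response = bedrock_runtime.invoke_model(
    modelId="amazon.nova-2-multimodal-embeddings-v1:0",
    body=json.dumps(body),
)
response_body = json.loads(response["body"].read())

# Matches the response parsing added in _generate_embedding_with_retry
embedding = response_body["embeddings"][0]["embedding"]
print(len(embedding))  # expect 3072
```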
diff --git a/lib/idp_common_pkg/idp_common/bedrock/client.py b/lib/idp_common_pkg/idp_common/bedrock/client.py
index 3f19ffe5..42d0df2a 100644
--- a/lib/idp_common_pkg/idp_common/bedrock/client.py
+++ b/lib/idp_common_pkg/idp_common/bedrock/client.py
@@ -16,6 +16,7 @@
 import copy
 import random
 import socket
+import base64
 from typing import Dict, Any, List, Optional, Union, Tuple, Type
 from botocore.config import Config
 from botocore.exceptions import (
@@ -25,7 +26,10 @@
     EndpointConnectionError,
 )
 from urllib3.exceptions import ReadTimeoutError as Urllib3ReadTimeoutError
-
+from idp_common.image import (
+    prepare_image,
+    prepare_bedrock_image_attachment
+)
 
 # Dummy exception classes for requests timeouts if requests is not available
 class _RequestsReadTimeout(Exception):
@@ -711,22 +715,29 @@ def get_guardrail_config(self) -> Optional[Dict[str, str]]:
 
     def generate_embedding(
         self,
-        text: str,
+        text: str = "",
+        image_source: Optional[Union[str, bytes]] = None,
         model_id: str = "amazon.titan-embed-text-v1",
+        dimensions: int = 1024,
         max_retries: Optional[int] = None,
     ) -> List[float]:
         """
-        Generate an embedding vector for the given text using Amazon Bedrock.
+        Generate an embedding vector for the given text or image_source using Amazon Bedrock.
+        At least one of text or the image is required to generate the embedding.
+        For Titan Multimodal embedding models, you can include both to create an embeddings query vector that averages the resulting text embeddings and image embeddings vectors.
+        For Nova Multimodal embedding models, exactly one of text or the image must be present, but not both.
 
         Args:
             text: The text to generate embeddings for
+            image_source: The image to generate embeddings for (can be either an S3 URI (s3://bucket/key) or raw image bytes)
             model_id: The embedding model ID to use (default: amazon.titan-embed-text-v1)
             max_retries: Optional override for the instance's max_retries setting
+            dimensions: Length of the output embeddings vector
 
         Returns:
             List of floats representing the embedding vector
         """
-        if not text or not isinstance(text, str):
+        if (not text or not isinstance(text, str)) and (not image_source):
             # Return an empty vector for empty input
             return []
 
@@ -741,12 +752,61 @@ def generate_embedding(
         # Normalize whitespace and prepare the input text
         normalized_text = " ".join(text.split())
 
+        # Convert image to base64
+        if image_source:
+            image_bytes = prepare_image(image_source)
+            image_base64 = base64.b64encode(image_bytes).decode('utf-8')
+
+        dimensions = int(dimensions)
+
         # Prepare the request body based on the model
-        if "amazon.titan-embed" in model_id:
-            request_body = json.dumps({"inputText": normalized_text})
+        payload_body: Dict[str, Any] = {}
+
+        if "amazon.titan-embed-text" in model_id:
+            if not normalized_text:
+                raise ValueError(
+                    "Amazon Titan Text models require a text parameter to generate embeddings."
+                )
+            payload_body = {
+                "inputText": normalized_text,
+                "dimensions": dimensions,
+            }
+        elif "amazon.titan-embed-image" in model_id:
+            payload_body = {
+                "embeddingConfig": {
+                    "outputEmbeddingLength": dimensions,
+                }
+            }
+            if normalized_text:
+                payload_body["inputText"] = normalized_text
+            if image_source:
+                payload_body["inputImage"] = image_base64
+        elif "amazon.nova-2-multimodal-embeddings" in model_id:
+            if normalized_text and image_source:
+                raise ValueError(
+                    "Amazon Nova Multimodal Embedding models require exactly one of text or image parameter, but not both at the same time."
+ ) + payload_body = { + "taskType": "SINGLE_EMBEDDING", + "singleEmbeddingParams": { + "embeddingPurpose": "GENERIC_INDEX", + "embeddingDimension": dimensions, + } + } + if normalized_text: + payload_body["singleEmbeddingParams"]["text"] = {"truncationMode": "END", "value": normalized_text} + if image_source: + payload_body["singleEmbeddingParams"].update(prepare_bedrock_image_attachment(image_bytes)) # detect image format + payload_body["singleEmbeddingParams"]["image"]["source"]["bytes"] = image_base64 else: # Default format for other models - request_body = json.dumps({"text": normalized_text}) + if not normalized_text: + raise ValueError( + "Default format requires a text parameter to generate embeddings for." + ) + payload_body = {"text": normalized_text} + + request_body = json.dumps(payload_body) # Call the recursive embedding function return self._generate_embedding_with_retry( @@ -805,6 +865,10 @@ def _generate_embedding_with_retry( # Handle different response formats based on the model if "amazon.titan-embed" in model_id: embedding = response_body.get("embedding", []) + elif "amazon.titan-embed-image" in model_id: + embedding = response_body.get("embedding", []) + elif "amazon.nova-2-multimodal-embeddings" in model_id: + embedding = response_body["embeddings"][0]["embedding"] else: # Default extraction format embedding = response_body.get("embedding", []) From 94d33e7e966924ceda88200db5de2d88ea894f3a Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 28 Nov 2025 22:56:01 +0100 Subject: [PATCH 05/39] chore: move idp_common.image import to generate_embedding function, otherwise bedrock client would always require PIL dependency --- lib/idp_common_pkg/idp_common/bedrock/client.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/idp_common_pkg/idp_common/bedrock/client.py b/lib/idp_common_pkg/idp_common/bedrock/client.py index 42d0df2a..9167a4ce 100644 --- a/lib/idp_common_pkg/idp_common/bedrock/client.py +++ b/lib/idp_common_pkg/idp_common/bedrock/client.py @@ -26,10 +26,6 @@ EndpointConnectionError, ) from urllib3.exceptions import ReadTimeoutError as Urllib3ReadTimeoutError -from idp_common.image import ( - prepare_image, - prepare_bedrock_image_attachment -) # Dummy exception classes for requests timeouts if requests is not available class _RequestsReadTimeout(Exception): @@ -737,6 +733,11 @@ def generate_embedding( Returns: List of floats representing the embedding vector """ + from idp_common.image import ( + prepare_image, + prepare_bedrock_image_attachment + ) + if (not text or not isinstance(text, str)) and (not image_source): # Return an empty vector for empty input return [] From 182ec1b8c4a309bb3853f31f852db9f9a9263e35 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 28 Nov 2025 22:59:54 +0100 Subject: [PATCH 06/39] feat: add notebook to ingest FATURA2 dataset into S3 vectors --- notebooks/misc/fewshot_dataset_import.ipynb | 487 ++++++++++++++++++++ 1 file changed, 487 insertions(+) create mode 100644 notebooks/misc/fewshot_dataset_import.ipynb diff --git a/notebooks/misc/fewshot_dataset_import.ipynb b/notebooks/misc/fewshot_dataset_import.ipynb new file mode 100644 index 00000000..baca9464 --- /dev/null +++ b/notebooks/misc/fewshot_dataset_import.ipynb @@ -0,0 +1,487 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Few-shot Dataset Import to S3 Vector store\n", + "\n", + "This notebook demonstrates how to import the FATURA2 dataset into S3 Vectors for use with the examples-provider Lambda 
function.\n", + "\n", + "The FATURA2 dataset contains invoice documents that can be used as few-shot examples for document extraction tasks.\n", + "\n", + "## Process Overview:\n", + "\n", + "1. **Load FATURA2 Dataset** - Download and process the dataset\n", + "2. **Generate Embeddings** - Create multimodal embeddings using Amazon Nova\n", + "3. **Upload to S3 Vectors** - Store embeddings and metadata in S3 Vectors index\n", + "4. **Verify Import** - Test similarity search functionality\n", + "\n", + "> **Note**: This notebook requires AWS credentials with permissions for Bedrock, S3, and S3 Vectors services." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's make sure that modules are autoreloaded\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "ROOTDIR=\"../..\"\n", + "# First uninstall existing package (to ensure we get the latest version)\n", + "%pip uninstall -y idp_common\n", + "\n", + "# Install the IDP common package with all components in development mode\n", + "%pip install -q -e \"{ROOTDIR}/lib/idp_common_pkg[dev, all]\"\n", + "\n", + "# Note: We can also install specific components like:\n", + "# %pip install -q -e \"{ROOTDIR}/lib/idp_common_pkg[ocr,classification,extraction,evaluation]\"\n", + "\n", + "# Check installed version\n", + "%pip show idp_common | grep -E \"Version|Location\"\n", + "\n", + "# Install required packages\n", + "%pip install -q pillow requests tqdm pandas\n", + "\n", + "# Optionally use a .env file for environment variables\n", + "try:\n", + " from dotenv import load_dotenv\n", + " load_dotenv() \n", + "except ImportError:\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Import Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import zipfile\n", + "import requests\n", + "from pathlib import Path\n", + "from typing import Dict, List, Any\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "\n", + "import boto3\n", + "from PIL import Image\n", + "\n", + "# Import IDP common modules\n", + "from idp_common import bedrock\n", + "\n", + "print(\"Libraries imported successfully\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Configure S3 Vectors and Bedrock" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration - Update these values based on your deployment of the 'notebooks/examples/dynamic-few-shot-lambda' stack\n", + "S3_BUCKET_FOR_IMAGES = \"genaiidp-dynamic-few-shot-dynamicfewshotdatasetbuc-nuz4jeue5hds\" # Stack output 'DynamicFewShotDatasetBucket'\n", + "S3_VECTORS_BUCKET = \"genaiidp-dynamic-few-shot\"\n", + "S3_VECTORS_INDEX = \"documents\"\n", + "\n", + "EMBEDDING_MODEL_ID = \"amazon.nova-2-multimodal-embeddings-v1:0\"\n", + "EMBEDDING_DIMENSIONS = 3072\n", + "\n", + "# Initialize clients\n", + "s3vectors_client = boto3.client('s3vectors')\n", + "s3_client = boto3.client('s3')\n", + "bedrock_client = bedrock.BedrockClient()\n", + "\n", + "print(f\"Configured for S3 Vectors bucket: {S3_VECTORS_BUCKET}\")\n", + "print(f\"Configured for S3 Vectors index: {S3_VECTORS_INDEX}\")\n", + "print(f\"Using embedding model: {EMBEDDING_MODEL_ID}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Load FATURA2 Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download and extract FATURA2 dataset from Zenodo\n", + "print(\"Downloading FATURA2 dataset...\")\n", + "\n", + "# Configuration for this dataset\n", + "IMAGE_VARIANT = 'colored_images'\n", + "ANNOTATION_VARIANT = 'Original_Format'\n", + "CLASS_LABEL = 'invoice'\n", + "\n", + "# Create datasets directory\n", + "datasets_dir = Path('datasets')\n", + "datasets_dir.mkdir(exist_ok=True)\n", + "\n", + "# Download the zip file\n", + "zip_url = 'https://zenodo.org/records/10371464/files/FATURA2.zip?download=1'\n", + "zip_path = datasets_dir / 'FATURA2.zip'\n", + "\n", + "if not zip_path.exists():\n", + " response = requests.get(zip_url, stream=True)\n", + " response.raise_for_status()\n", + " \n", + " with open(zip_path, 'wb') as f:\n", + " for chunk in tqdm(response.iter_content(chunk_size=8192), desc='Downloading'):\n", + " f.write(chunk)\n", + " print(f\"Downloaded {zip_path}\")\n", + "else:\n", + " print(f\"Using existing {zip_path}\")\n", + "\n", + "# Extract the zip file\n", + "extract_dir = datasets_dir / 'invoices_dataset_final'\n", + "if not extract_dir.exists():\n", + " with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n", + " zip_ref.extractall(datasets_dir)\n", + " print(f\"Extracted to {extract_dir}\")\n", + "else:\n", + " print(f\"Using existing {extract_dir}\")\n", + "\n", + "colored_images = extract_dir / IMAGE_VARIANT\n", + "\n", + "# Load images from extracted directory\n", + "image_files = list(colored_images.glob('**/*.jpg'))\n", + "print(f\"Found {len(image_files)} {IMAGE_VARIANT} files\")\n", + "\n", + "# Show sample\n", + "if image_files:\n", + " sample_image = Image.open(image_files[0])\n", + " print(f\"Sample image: {image_files[0].name}\")\n", + " print(f\"Image size: {sample_image.size}\")\n", + "\n", + "print(f\"Image variant: {IMAGE_VARIANT}\")\n", + "print(f\"Annotation variant: {ANNOTATION_VARIANT}\")\n", + "print(f\"Class label: {CLASS_LABEL}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Process Dataset and Generate Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "def upload_image_to_s3(image_bytes: bytes, s3_key: str) -> str:\n", + " \"\"\"Upload image to S3 and return S3 URI.\"\"\"\n", + " s3_client.put_object(\n", + " Bucket=S3_BUCKET_FOR_IMAGES,\n", + " Key=s3_key,\n", + " Body=image_bytes,\n", + " ContentType='image/jpeg'\n", + " )\n", + " return f\"s3://{S3_BUCKET_FOR_IMAGES}/{s3_key}\"\n", + "\n", + "def load_split(extract_dir, split_name):\n", + " csv_path = extract_dir / (split_name + \".csv\")\n", + " return pd.read_csv(csv_path)\n", + "\n", + "def read_annotation(extract_dir, annot_path):\n", + " json_path = extract_dir / \"Annotations\" / ANNOTATION_VARIANT / annot_path\n", + " with open(json_path, \"r\") as f:\n", + " annotation = f.read()\n", + " return json.loads(annotation)\n", + "\n", + "def load_image(extract_dir, img_path):\n", + " image_path = extract_dir / IMAGE_VARIANT / img_path\n", + " with open(image_path, \"rb\") as f:\n", + " image_content = f.read()\n", + " return image_content\n", + "\n", + "def map_labels(annotations):\n", + " labels = {}\n", + " labels['invoice_number'] = annotations.get(\"NUMBER\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['invoice_date'] = annotations.get(\"DATE\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['due_date'] = annotations.get(\"DUE_DATE\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['vendor_name'] = annotations.get(\"SELLER_NAME\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['vendor_address'] = annotations.get(\"SELLER_ADDRESS\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " BUYER = annotations.get(\"BUYER\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['customer_name'] = BUYER[0] if len(BUYER) > 0 else []\n", + " labels['customer_address'] = BUYER[1:] if len(BUYER) > 1 else []\n", + " labels['items'] = \"null\"\n", + " labels['quantities'] = \"null\"\n", + " labels['unit_prices'] = \"null\"\n", + " labels['subtotal'] = annotations.get(\"SUB_TOTAL\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['tax'] = annotations.get(\"TAX\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['total_amount'] = annotations.get(\"TOTAL\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['payment_terms'] = annotations.get(\"NOTE\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['po_number'] = annotations.get(\"GSTIN_BUYER\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " return labels\n", + "\n", + "def get_attributes_prompt(labels):\n", + " attributes_prompt = f\"\"\"expected attributes are:\n", + " \"invoice_number\": {\", \".join(labels['invoice_number'])}\n", + " \"invoice_date\": {\", \".join(labels['invoice_date'])}\n", + " \"due_date\": {\", \".join(labels['due_date'])}\n", + " \"vendor_name\": {\", \".join(labels['vendor_name'])}\n", + " \"vendor_address\": {\", \".join(labels['vendor_address'])}\n", + " \"customer_name\": {labels['customer_name']}\n", + " \"customer_address\": {\", \".join(labels['customer_address'])}\n", + " \"items\": {labels['items']}\n", + " \"quantities\": {labels['quantities']}\n", + " \"unit_prices\": {labels['unit_prices']}\n", + " \"subtotal\": {\", \".join(labels['subtotal'])}\n", + " \"tax\": {\", \".join(labels['tax'])}\n", + " \"total_amount\": {\", \".join(labels['total_amount'])}\n", + " \"payment_terms\": {\", \".join(labels['payment_terms'])}\n", + " \"po_number\": {\", 
\".join(labels['po_number'])}\n", + " \"\"\".strip()\n", + " return attributes_prompt\n", + "\n", + "def create_metadata(annotations: Dict, s3_image_uri: str) -> Dict:\n", + " \"\"\"Create metadata for S3 Vectors entry.\"\"\"\n", + " class_prompt = f\"This is an example of the class '{CLASS_LABEL}'\"\n", + "\n", + " labels = map_labels(annotations)\n", + " attributes_prompt = get_attributes_prompt(labels)\n", + "\n", + " return {\n", + " \"classLabel\": CLASS_LABEL,\n", + " \"classPrompt\": class_prompt,\n", + " \"attributesPrompt\": attributes_prompt,\n", + " \"imagePath\": s3_image_uri,\n", + " }\n", + "\n", + "print(\"Helper functions defined\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Import Dataset to S3 Vectors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Process a subset of the dataset (adjust as needed)\n", + "MAX_SAMPLES = 100 # Adjust this number based on your needs\n", + "BATCH_SIZE = 10 # Adjust this number based on your needs\n", + "\n", + "dataset_split = load_split(extract_dir, \"strat1_train\")\n", + "samples_to_process = min(MAX_SAMPLES, len(dataset_split))\n", + "\n", + "print(f\"Processing {samples_to_process} samples from FATURA2 dataset...\")\n", + "\n", + "vectors_to_upload = []\n", + "failed_samples = []\n", + "\n", + "for i in tqdm(range(samples_to_process), desc=\"Processing samples\"):\n", + " try:\n", + " df_image = dataset_split.iloc[i]\n", + "\n", + " # Load annotations\n", + " annotations = read_annotation(extract_dir, df_image[\"annot_path\"])\n", + " \n", + " # Load image\n", + " image_bytes = load_image(extract_dir, df_image[\"img_path\"])\n", + "\n", + " # Upload image to S3\n", + " s3_key = f\"fatura2/{IMAGE_VARIANT}/{df_image['img_path']}\"\n", + " s3_image_uri = upload_image_to_s3(image_bytes, s3_key)\n", + " \n", + " # Generate embedding\n", + " embedding = bedrock_client.generate_embedding(\n", + " image_source=image_bytes,\n", + " model_id=EMBEDDING_MODEL_ID,\n", + " dimensions=EMBEDDING_DIMENSIONS\n", + " )\n", + " \n", + " # Create metadata\n", + " metadata = create_metadata(annotations, s3_image_uri)\n", + "\n", + " # Prepare vector for upload\n", + " vector_entry = {\n", + " \"key\": f\"fatura2_sample_{i:06d}\",\n", + " \"data\": {\"float32\": embedding},\n", + " \"metadata\": metadata\n", + " }\n", + "\n", + " vectors_to_upload.append(vector_entry)\n", + " \n", + " # Upload in batches to avoid memory issues\n", + " if len(vectors_to_upload) >= BATCH_SIZE: # Batch size\n", + " print(f\"\\nUploading batch of {len(vectors_to_upload)} vectors...\")\n", + " response = s3vectors_client.put_vectors(\n", + " vectorBucketName=S3_VECTORS_BUCKET,\n", + " indexName=S3_VECTORS_INDEX,\n", + " vectors=vectors_to_upload\n", + " )\n", + " print(f\"Batch upload response: {response.get('ResponseMetadata', {}).get('HTTPStatusCode')}\")\n", + " vectors_to_upload = [] # Clear batch\n", + " \n", + " except Exception as e:\n", + " print(f\"\\nFailed to process sample {i}: {e}\")\n", + " failed_samples.append(i)\n", + " continue\n", + "\n", + "# Upload remaining vectors\n", + "if vectors_to_upload:\n", + " print(f\"\\nUploading final batch of {len(vectors_to_upload)} vectors...\")\n", + " response = s3vectors_client.put_vectors(\n", + " vectorBucketName=S3_VECTORS_BUCKET,\n", + " indexName=S3_VECTORS_INDEX,\n", + " vectors=vectors_to_upload\n", + " )\n", + " print(f\"Final batch upload response: {response.get('ResponseMetadata', 
{}).get('HTTPStatusCode')}\")\n",
    "\n",
    "print(f\"\\nImport completed!\")\n",
    "print(f\"Successfully processed: {samples_to_process - len(failed_samples)} samples\")\n",
    "print(f\"Failed samples: {len(failed_samples)}\")\n",
    "if failed_samples:\n",
    "    print(f\"Failed sample indices: {failed_samples[:10]}...\") # Show first 10"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Verify Import with Similarity Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test similarity search with a sample from the dataset\n",
    "test_split = load_split(extract_dir, \"strat1_test\")\n",
    "\n",
    "test_sample_index = 0\n",
    "df_image = test_split.iloc[test_sample_index]\n",
    "\n",
    "test_image_bytes = load_image(extract_dir, df_image[\"img_path\"])\n",
    "\n",
    "print(f\"Testing similarity search with sample {extract_dir / IMAGE_VARIANT / df_image['img_path']}...\")\n",
    "\n",
    "# Generate embedding for test image\n",
    "test_embedding = bedrock_client.generate_embedding(\n",
    "    image_source=test_image_bytes,\n",
    "    model_id=EMBEDDING_MODEL_ID,\n",
    "    dimensions=EMBEDDING_DIMENSIONS\n",
    ")\n",
    "\n",
    "# Query S3 Vectors for similar examples\n",
    "response = s3vectors_client.query_vectors(\n",
    "    vectorBucketName=S3_VECTORS_BUCKET,\n",
    "    indexName=S3_VECTORS_INDEX,\n",
    "    queryVector={\"float32\": test_embedding},\n",
    "    topK=5,\n",
    "    returnDistance=True,\n",
    "    returnMetadata=True\n",
    ")\n",
    "\n",
    "print(f\"\\nFound {len(response['vectors'])} similar examples:\")\n",
    "for i, vector in enumerate(response['vectors']):\n",
    "    distance = vector.get('distance', 'N/A')\n",
    "    key = vector.get('key', 'N/A')\n",
    "    metadata = vector.get('metadata', {})\n",
    "    class_label = metadata.get('classLabel', 'N/A')\n",
    "    class_prompt = metadata.get('classPrompt', 'N/A')\n",
    "    attributes_prompt = metadata.get('attributesPrompt', 'N/A')\n",
    "    image_path = metadata.get('imagePath', 'N/A')\n",
    "    \n",
    "    print(f\"  {i+1}. Key: {key}\")\n",
    "    print(f\"     Distance: {distance:.4f}\")\n",
    "    print(f\"     Class Label: {class_label}\")\n",
    "    print(f\"     Class Prompt: {class_prompt}\")\n",
    "    print(f\"     Attributes Prompt: {attributes_prompt}\")\n",
    "    print(f\"     Image Path: {image_path}\")\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Summary and Next Steps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"=== Few-shot Dataset Import Summary ===\")\n",
    "print(f\"✅ Dataset: FATURA2 (Invoice documents)\")\n",
    "print(f\"✅ Samples processed: {samples_to_process - len(failed_samples)}\")\n",
    "print(f\"✅ S3 Vectors Bucket: {S3_VECTORS_BUCKET}\")\n",
    "print(f\"✅ S3 Vectors Index: {S3_VECTORS_INDEX}\")\n",
    "print(f\"✅ Images stored in: s3://{S3_BUCKET_FOR_IMAGES}/fatura2/{IMAGE_VARIANT}/\")\n",
    "print(f\"✅ Embedding Model: {EMBEDDING_MODEL_ID}\")\n",
    "print(f\"✅ Similarity search verified\")\n",
    "\n",
    "print(\"\\n=== Next Steps ===\")\n",
    "print(\"1. Upload your own datasets into S3 Vectors\")\n",
    "print(\"2. Configure your IDP extraction to use the dynamic few-shot Lambda ARN\")\n",
    "print(\"3. 
Test document processing with few-shot examples!\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 1e7cac3df80c6cdfe3c1595f175511e1691910cb Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Mon, 1 Dec 2025 21:06:04 +0100 Subject: [PATCH 07/39] chore: update input parameter for document_text + fixes --- .../GENAIIDP-dynamic-few-shot.py | 21 ++++++++++--------- .../dynamic-few-shot-lambda/README.md | 5 +---- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py index 94d9f746..61b69295 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py +++ b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py @@ -21,7 +21,8 @@ from idp_common import bedrock, s3 logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) +level = logging.getLevelName(os.environ.get("LOG_LEVEL", "INFO")) +logger.setLevel(level) # Parse environment variables with error handling try: @@ -45,7 +46,7 @@ def lambda_handler(event, context): Input event: { "class_label": "", - "document_texts": ["", "", ...], + "document_text": "", "image_content": ["", "", ...] } @@ -66,13 +67,13 @@ def lambda_handler(event, context): # Validate input class_label = event.get("class_label") - document_texts = event.get("document_texts", []) + document_text = event.get("document_text") image_content = event.get("image_content", []) logger.info(f"=== INPUT VALUES ===") logger.info(f"Class label: {class_label if class_label else 'Not specified'}") - logger.info(f"Document texts: {len(document_texts)}") - logger.info(f"Image content: {len(image_content)}") + logger.info(f"Document text: {len(document_text) if document_text else "0"} bytes") + logger.info(f"Image content: {len(image_content)} images") # Decode input data image_data = _decode_images(image_content) @@ -114,17 +115,17 @@ def _encode_images(image_content): def _s3vectors_find_similar_items(image_data): """Find similar items for input""" - # find similar items based on image similarity only similar_items = {} for page_image in image_data: - result = _s3vectors_find_similar_items_from_image(image_data) + result = _s3vectors_find_similar_items_from_image(page_image) _merge_examples(similar_items, result) # create result set result = [] for key, example in similar_items.items(): metadata = example.get("metadata", {}) + distance = example.get("distance") attributes_prompt = metadata.get("attributesPrompt") # Only process this example if it has a non-empty attributesPrompt @@ -134,7 +135,7 @@ def _s3vectors_find_similar_items(image_data): ) continue - attributes = _extract_metadata(metadata) + attributes = _extract_metadata(metadata, distance) result.append(attributes) return result @@ -169,8 +170,8 @@ def _merge_examples(examples, new_examples): new_distance = new_example.get("distance", 1.0) # update example - if combined_examples.get(key): - existing_distance = combined_examples[key].get("distance", 1.0) + if examples.get(key): + existing_distance = examples[key].get("distance", 1.0) 
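+        # keep the lower (more similar) distance when the same key is returned for multiple pages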
examples[key]["distance"] = min(new_distance, existing_distance) examples[key]["metadata"] = new_example.get("metadata") # insert example diff --git a/notebooks/examples/dynamic-few-shot-lambda/README.md b/notebooks/examples/dynamic-few-shot-lambda/README.md index d30ba33d..e30f913e 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/README.md +++ b/notebooks/examples/dynamic-few-shot-lambda/README.md @@ -103,10 +103,7 @@ extraction: ```json { "class_label": "invoice", - "document_texts": [ - "Invoice text or markdown from page 1...", - "Invoice text or markdown from page 2..." - ], + "document_text": "Text or markdown from section 1 (pages 1-3)...", "image_content": [ "base64_encoded_image_1", "base64_encoded_image_2" From 99a3605aaeebe61c07d87851b87a6b6c9b59a572 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Mon, 1 Dec 2025 21:09:19 +0100 Subject: [PATCH 08/39] feat: add notebook for dynamic few-shot Lambda testing --- .../config/extraction_with_few_shot.yaml | 101 +++++ ...ep3_extraction_with_dynamic_few_shot.ipynb | 420 ++++++++++++++++++ 2 files changed, 521 insertions(+) create mode 100644 notebooks/examples/config/extraction_with_few_shot.yaml create mode 100644 notebooks/examples/step3_extraction_with_dynamic_few_shot.ipynb diff --git a/notebooks/examples/config/extraction_with_few_shot.yaml b/notebooks/examples/config/extraction_with_few_shot.yaml new file mode 100644 index 00000000..addd9a01 --- /dev/null +++ b/notebooks/examples/config/extraction_with_few_shot.yaml @@ -0,0 +1,101 @@ +# Extraction Service Configuration +extraction: + top_p: '0.1' + max_tokens: '4096' + top_k: '5' + temperature: '0.0' + model: us.amazon.nova-pro-v1:0 + system_prompt: >- + You are a document assistant. Respond only with JSON. Never make up data, only provide data found in the document being provided. + task_prompt: >- + + + You are an expert in document analysis and information extraction. + You can understand and extract key information from documents classified as type + + {DOCUMENT_CLASS}. + + + + + + + Your task is to take the unstructured text provided and convert it into a well-organized table format using JSON. Identify the main entities, attributes, or categories mentioned in the attributes list below and use them as keys in the JSON object. + Then, extract the relevant information from the text and populate the corresponding values in the JSON object. + + + + + + + Guidelines: + 1. Ensure that the data is accurately represented and properly formatted within + the JSON structure + 2. Include double quotes around all keys and values + 3. Do not make up data - only extract information explicitly found in the + document + 4. Do not use /n for new lines, use a space instead + 5. If a field is not found or if unsure, return null + 6. All dates should be in MM/DD/YYYY format + 7. Do not perform calculations or summations unless totals are explicitly given + 8. If an alias is not found in the document, return null + 9. Guidelines for checkboxes: + 9.A. CAREFULLY examine each checkbox, radio button, and selection field: + - Look for marks like โœ“, โœ—, x, filled circles (โ—), darkened areas, or handwritten checks indicating selection + - For checkboxes and multi-select fields, ONLY INCLUDE options that show clear visual evidence of selection + - DO NOT list options that have no visible selection mark + 9.B. 
For ambiguous or overlapping tick marks:
+    - If a mark overlaps between two or more checkboxes, determine which option contains the majority of the mark
+    - Consider a checkbox selected if the mark is primarily inside the check box or over the option text
+    - When a mark touches multiple options, analyze which option was most likely intended based on position and density. For handwritten checks, the mark typically flows from the selected checkbox outward.
+    - Carefully analyze visual cues and contextual hints. Think from a human perspective, anticipate natural tendencies, and apply thoughtful reasoning to make the best possible judgment.
+    10. Think step by step first and then answer.
+
+
+
+    If the attributes section below contains a list of attribute names and
+    descriptions, then output only those attributes, using the provided
+    descriptions as guidance for finding the correct values.
+
+
+
+    {ATTRIBUTE_NAMES_AND_DESCRIPTIONS}
+
+
+
+
+
+    {FEW_SHOT_EXAMPLES}
+
+
+
+    <>
+
+
+
+
+    {DOCUMENT_TEXT}
+
+
+
+
+
+
+    {DOCUMENT_IMAGE}
+
+
+
+
+
+
+    Extract key information from the document and return a JSON object with the following key steps:
+    1. Carefully analyze the document text to identify the requested attributes
+    2. Extract only information explicitly found in the document - never make up data
+    3. Format all dates as MM/DD/YYYY and replace newlines with spaces
+    4. For checkboxes, only include options with clear visual selection marks
+    5. Use null for any fields not found in the document
+    6. Ensure the output is properly formatted JSON with quoted keys and values
+    7. Think step by step before finalizing your answer
+
+
diff --git a/notebooks/examples/step3_extraction_with_dynamic_few_shot.ipynb b/notebooks/examples/step3_extraction_with_dynamic_few_shot.ipynb
new file mode 100644
index 00000000..5d5a0663
--- /dev/null
+++ b/notebooks/examples/step3_extraction_with_dynamic_few_shot.ipynb
@@ -0,0 +1,420 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step 3: Dynamic Few-Shot Information Extraction\n",
+    "\n",
+    "This notebook demonstrates the **dynamic few-shot prompting feature** for Pattern 2. It shows how to:\n",
+    "\n",
+    "- Configure a dynamic few-shot Lambda function for extraction\n",
+    "- Compare default vs examples-enhanced extraction results\n",
+    "- Inspect Lambda payloads and responses\n",
+    "- Handle errors and monitor performance\n",
+    "\n",
+    "**Prerequisites:**\n",
+    "- Completed Step 2 (Classification)\n",
+    "- AWS Lambda permissions to create/invoke functions\n",
+    "- Dynamic few-shot Lambda function deployed\n",
+    "- S3 Vectors index populated with examples (`notebooks/misc/fewshot_dataset_import.ipynb`)\n",
+    "\n",
+    "**Key Feature:**\n",
+    "The `dynamic_few_shot_lambda_arn` configuration field allows you to dynamically retrieve similar examples using S3 Vectors similarity search to improve extraction accuracy through few-shot prompting."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. 
Setup and Import Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import time\n", + "import logging\n", + "import boto3\n", + "from pathlib import Path\n", + "import yaml\n", + "\n", + "# Import IDP libraries\n", + "from idp_common.models import Document, Status\n", + "from idp_common.s3 import get_json_content\n", + "from idp_common import extraction\n", + "\n", + "# Configure logging to see Lambda invocation details\n", + "logging.basicConfig(level=logging.INFO)\n", + "logging.getLogger('idp_common.extraction').setLevel(logging.INFO)\n", + "logging.getLogger('idp_common.bedrock.client').setLevel(logging.INFO)\n", + "\n", + "print(\"Libraries imported successfully\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Load Previous Step Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load document from previous step\n", + "classification_data_dir = Path(\".data/step2_classification\")\n", + "\n", + "# Load document object from JSON\n", + "document_path = classification_data_dir / \"document.json\"\n", + "with open(document_path, 'r') as f:\n", + " document = Document.from_json(f.read())\n", + "\n", + "# Load configuration directly from config files\n", + "config_dir = Path(\"config\")\n", + "CONFIG = {}\n", + "\n", + "# Load each configuration file\n", + "config_files = [\n", + " \"extraction_with_few_shot.yaml\",\n", + " \"classes.yaml\"\n", + "]\n", + "\n", + "for config_file in config_files:\n", + " config_path = config_dir / config_file\n", + " if config_path.exists():\n", + " with open(config_path, 'r') as f:\n", + " file_config = yaml.safe_load(f)\n", + " CONFIG.update(file_config)\n", + " print(f\"Loaded {config_file}\")\n", + " else:\n", + " print(f\"Warning: {config_file} not found\")\n", + "\n", + "# Load environment info\n", + "env_path = classification_data_dir / \"environment.json\"\n", + "with open(env_path, 'r') as f:\n", + " env_info = json.load(f)\n", + "\n", + "# Set environment variables\n", + "os.environ['AWS_REGION'] = env_info['region']\n", + "os.environ['METRIC_NAMESPACE'] = 'IDP-Dynamic-Few-Shot'\n", + "\n", + "print(f\"Loaded document: {document.id}\")\n", + "print(f\"Document status: {document.status.value}\")\n", + "print(f\"Number of sections: {len(document.sections) if document.sections else 0}\")\n", + "print(f\"Loaded configuration sections: {list(CONFIG.keys())}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Configure Dynamic Few-Shot Lambda ARN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ๐Ÿ”ง CONFIGURATION: Set your dynamic few-shot Lambda ARN here\n", + "# Replace with your actual Lambda function ARN for live testing\n", + "\n", + "# Check if dynamic few-shot Lambda function exists\n", + "lambda_client = boto3.client('lambda')\n", + "DYNAMIC_FEW_SHOT_LAMBDA_ARN = None\n", + "\n", + "try:\n", + " response = lambda_client.get_function(FunctionName='GENAIIDP-dynamic-few-shot')\n", + " DYNAMIC_FEW_SHOT_LAMBDA_ARN = response['Configuration']['FunctionArn']\n", + " print(f\"โœ… Found dynamic few-shot Lambda function: {DYNAMIC_FEW_SHOT_LAMBDA_ARN}\")\n", + "except lambda_client.exceptions.ResourceNotFoundException:\n", + " print(\"โš ๏ธ Dynamic Few-Shot Lambda function not found: GENAIIDP-dynamic-few-shot\")\n", + " print(\"๐Ÿ’ก Deploy using: cd notebooks/examples/dynamic-few-shot-lambda && sam deploy --guided\")\n", + "except Exception as e:\n", + " print(f\"Error checking Lambda function: {e}\")\n", + "\n", + "if not DYNAMIC_FEW_SHOT_LAMBDA_ARN:\n", + " print(\"โš ๏ธ No dynamic few-shot Lambda ARN configured\")\n", + " print(\"๐Ÿ’ก This demo will show standard extraction without few-shot examples\")\n", + " print(\"๐Ÿ”ง To test with examples, deploy the dynamic few-shot Lambda first\")\n", + "else:\n", + " print(f\"โœ… Dynamic few-shot Lambda ARN configured: {DYNAMIC_FEW_SHOT_LAMBDA_ARN}\")\n", + " print(\"๐Ÿš€ This demo will use few-shot examples from S3 Vectors\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Extraction Comparison: Default vs Dynamic Few-Shot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.1 Default Extraction (Without Dynamic Few-Shot)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create configuration WITHOUT dynamic few-shot Lambda\n", + "config_default = CONFIG.copy()\n", + "if 'dynamic_few_shot_lambda_arn' in config_default.get('extraction', {}):\n", + " del config_default['extraction']['dynamic_few_shot_lambda_arn']\n", + "\n", + "print(\"=== DEFAULT EXTRACTION CONFIGURATION ===\")\n", + "print(f\"Model: {config_default.get('extraction', {}).get('model')}\")\n", + "print(f\"Dynamic Few-Shot Lambda: {config_default.get('extraction', {}).get('dynamic_few_shot_lambda_arn', 'None')}\")\n", + "\n", + "# Create extraction service with default config\n", + "extraction_service_default = extraction.ExtractionService(config=config_default)\n", + "print(\"\\nโœ… Default extraction service initialized\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run default extraction on first section\n", + "if document.sections:\n", + " first_section = document.sections[0]\n", + " print(f\"๐Ÿ”„ Processing section {first_section.section_id} with DEFAULT prompts\")\n", + " print(f\"Classification: {first_section.classification}\")\n", + " print(f\"Pages: {first_section.page_ids}\")\n", + " \n", + " # Save original document state\n", + " document_default = Document.from_json(document.to_json())\n", + " \n", + " # Process with default extraction\n", + " start_time = time.time()\n", + " document_default = extraction_service_default.process_document_section(\n", + " document=document_default,\n", + " section_id=first_section.section_id\n", + " )\n", + " default_extraction_time = time.time() - start_time\n", + " \n", + 
" print(f\"โœ… Default extraction completed in {default_extraction_time:.2f} seconds\")\n", + "\n", + " # Store results for comparison\n", + " default_section_result = None\n", + " for section in document_default.sections:\n", + " if section.section_id == first_section.section_id:\n", + " default_section_result = section\n", + " break\n", + " \n", + "else:\n", + " print(\"โš ๏ธ No sections found in document\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show section extraction result\n", + "if default_section_result:\n", + " print(f\"\\nSection {default_section_result.section_id} extraction result:\")\n", + " extraction_result_uri = default_section_result.extraction_result_uri\n", + "\n", + " if extraction_result_uri:\n", + " result = get_json_content(extraction_result_uri)\n", + " result_json = json.dumps(result[\"inference_result\"], indent=2)\n", + " print(result_json)\n", + "\n", + "else:\n", + " print(\"โš ๏ธ No sections found in document\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.2 Dynamic Few-Shot Extraction using Lambda" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if DYNAMIC_FEW_SHOT_LAMBDA_ARN:\n", + " # Create configuration WITH dynamic few-shot Lambda\n", + " config_few_shot = CONFIG.copy()\n", + " config_few_shot['extraction']['dynamic_few_shot_lambda_arn'] = DYNAMIC_FEW_SHOT_LAMBDA_ARN\n", + " \n", + " print(\"=== DYNAMIC FEW-SHOT EXTRACTION CONFIGURATION ===\")\n", + " print(f\"Model: {config_few_shot.get('extraction', {}).get('model')}\")\n", + " print(f\"Dynamic Few-Shot Lambda: {DYNAMIC_FEW_SHOT_LAMBDA_ARN}\")\n", + " print(f\"Lambda Function Name: {DYNAMIC_FEW_SHOT_LAMBDA_ARN.split(':')[-1]}\")\n", + " \n", + " # Create extraction service with dynamic few-shot config\n", + " extraction_service_few_shot = extraction.ExtractionService(config=config_few_shot)\n", + " \n", + " print(\"\\nโœ… Dynamic few-shot extraction service initialized\")\n", + " \n", + "else:\n", + " print(\"โš ๏ธ No dynamic few-shot Lambda ARN configured - skipping demonstration\")\n", + " config_few_shot = None\n", + " extraction_service_few_shot = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run dynamic few-shot extraction on first section\n", + "if DYNAMIC_FEW_SHOT_LAMBDA_ARN and document.sections:\n", + " first_section = document.sections[0]\n", + " print(f\"๐Ÿ”„ Processing section {first_section.section_id} with DYNAMIC FEW-SHOT\")\n", + " print(f\"Classification: {first_section.classification}\")\n", + " print(f\"Pages: {first_section.page_ids}\")\n", + " \n", + " # Create fresh document copy for examples processing\n", + " document_few_shot = Document.from_json(document.to_json())\n", + " \n", + " # Process with dynamic few-shot extraction\n", + " start_time = time.time()\n", + " \n", + " try:\n", + " document_few_shot = extraction_service_few_shot.process_document_section(\n", + " document=document_few_shot,\n", + " section_id=first_section.section_id\n", + " )\n", + " few_shot_extraction_time = time.time() - start_time\n", + " \n", + " print(f\"โœ… Dynamic few-shot extraction completed in {few_shot_extraction_time:.2f} seconds\")\n", + " \n", + " # Store results for comparison\n", + " few_shot_section_result = None\n", + " for section in document_few_shot.sections:\n", + " if section.section_id == first_section.section_id:\n", + " 
few_shot_section_result = section\n",
+    "                break\n",
+    "    \n",
+    "        # Performance comparison\n",
+    "        overhead = few_shot_extraction_time - default_extraction_time\n",
+    "        print(f\"\\n📊 Performance Comparison:\")\n",
+    "        print(f\"   Default: {default_extraction_time:.2f}s\")\n",
+    "        print(f\"   Dynamic Few-Shot: {few_shot_extraction_time:.2f}s\")\n",
+    "        print(f\"   Dynamic Few-Shot Overhead: {overhead:.2f}s ({overhead/default_extraction_time*100:.1f}% increase)\")\n",
+    "    \n",
+    "    except Exception as e:\n",
+    "        print(f\"❌ Dynamic few-shot extraction failed: {e}\")\n",
+    "        print(\"\\n🔍 This demonstrates the fail-fast error handling behavior\")\n",
+    "        few_shot_section_result = None\n",
+    "        few_shot_extraction_time = None\n",
+    "    \n",
+    "else:\n",
+    "    print(\"⚠️ Skipping dynamic few-shot extraction (no Lambda configured or no sections)\")\n",
+    "    document_few_shot = None\n",
+    "    few_shot_section_result = None\n",
+    "    few_shot_extraction_time = None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show section extraction result\n",
+    "if few_shot_section_result:\n",
+    "    print(f\"\\nSection {few_shot_section_result.section_id} extraction result:\")\n",
+    "    extraction_result_uri = few_shot_section_result.extraction_result_uri\n",
+    "\n",
+    "    if extraction_result_uri:\n",
+    "        result = get_json_content(extraction_result_uri)\n",
+    "        result_json = json.dumps(result[\"inference_result\"], indent=2)\n",
+    "        print(result_json)\n",
+    "\n",
+    "else:\n",
+    "    print(\"⚠️ No sections found in document\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Results and Summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"=== DEMO COMPLETE: SUMMARY ===\")\n",
+    "\n",
+    "sections_processed = 1 if document.sections else 0\n",
+    "dynamic_few_shot_used = DYNAMIC_FEW_SHOT_LAMBDA_ARN is not None\n",
+    "\n",
+    "print(f\"\\n✅ DEMO RESULTS:\")\n",
+    "print(f\"   📄 Document processed: {document.id}\")\n",
+    "print(f\"   📊 Sections processed: {sections_processed}\")\n",
+    "print(f\"   🔧 Dynamic Few-Shot used: {'Yes' if dynamic_few_shot_used else 'No'}\")\n",
+    "\n",
+    "if dynamic_few_shot_used and 'few_shot_extraction_time' in locals() and few_shot_extraction_time:\n",
+    "    print(f\"   ⏱️ Performance overhead: {few_shot_extraction_time - default_extraction_time:.2f}s\")\n",
+    "    print(f\"   📈 Accuracy improvement: Enhanced with few-shot examples\")\n",
+    "\n",
+    "print(f\"\\n🚀 TO IMPLEMENT DYNAMIC FEW-SHOT IN PRODUCTION:\")\n",
+    "print(f\"   1. 📁 Deploy dynamic few-shot Lambda stack\")\n",
+    "print(f\"   2. 📊 Populate S3 Vectors index with example documents\")\n",
+    "print(f\"   3. ⚙️ Add 'dynamic_few_shot_lambda_arn' to extraction config\")\n",
+    "print(f\"   4. 🧪 Test with your actual documents and use cases\")\n",
+    "print(f\"   5. 
๐Ÿ“Š Monitor CloudWatch logs for performance and accuracy\")\n", + "\n", + "print(f\"\\n๐Ÿ“š RESOURCES:\")\n", + "print(f\" ๐Ÿ“– Documentation: notebooks/examples/dynamic-few-shot-lambda/README.md\")\n", + "print(f\" ๐Ÿ”ง Lambda Function: notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py\")\n", + "print(f\" โ˜๏ธ Deploy: cd notebooks/examples/dynamic-few-shot-lambda && sam deploy --guided\")\n", + "print(f\" ๐Ÿ“Š Import Dataset: notebooks/misc/fewshot_dataset_import.ipynb\")\n", + "\n", + "print(f\"\\n๐Ÿ“Œ CONTINUE TO: step4_assessment.ipynb\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From ae2a9250e559a5c553b39b74a3da14ea104086b2 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Tue, 2 Dec 2025 09:54:42 +0100 Subject: [PATCH 09/39] chore: placeholder bucket name --- notebooks/misc/fewshot_dataset_import.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/misc/fewshot_dataset_import.ipynb b/notebooks/misc/fewshot_dataset_import.ipynb index baca9464..dc56a646 100644 --- a/notebooks/misc/fewshot_dataset_import.ipynb +++ b/notebooks/misc/fewshot_dataset_import.ipynb @@ -105,7 +105,7 @@ "outputs": [], "source": [ "# Configuration - Update these values based on your deployment of the 'notebooks/examples/dynamic-few-shot-lambda' stack\n", - "S3_BUCKET_FOR_IMAGES = \"genaiidp-dynamic-few-shot-dynamicfewshotdatasetbuc-nuz4jeue5hds\" # Stack output 'DynamicFewShotDatasetBucket'\n", + "S3_BUCKET_FOR_IMAGES = \"\" # Stack output 'DynamicFewShotDatasetBucket'\n", "S3_VECTORS_BUCKET = \"genaiidp-dynamic-few-shot\"\n", "S3_VECTORS_INDEX = \"documents\"\n", "\n", From bd52a2260b17d2b06e03480e4a36720550adfb3c Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Tue, 2 Dec 2025 10:02:58 +0100 Subject: [PATCH 10/39] chore: clarify distance --- .../GENAIIDP-dynamic-few-shot.py | 11 +++++++---- notebooks/examples/dynamic-few-shot-lambda/README.md | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py index 61b69295..a97fff33 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py +++ b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py @@ -45,8 +45,8 @@ def lambda_handler(event, context): Input event: { - "class_label": "", - "document_text": "", + "class_label": "", + "document_text": "", "image_content": ["", "", ...] } @@ -55,7 +55,7 @@ def lambda_handler(event, context): { "attributes_prompt": "expected attributes are: ...", "class_prompt": "This is an example of the class 'invoice'", - "distance": 0.892344521145, + "distance": 0.122344521145, "image_content": ["", "", ...] 
} ] @@ -138,7 +138,10 @@ def _s3vectors_find_similar_items(image_data): attributes = _extract_metadata(metadata, distance) result.append(attributes) - return result + # sort results by distance score (lowest to highest - lower is more similar) + sorted_result = sorted(result, key=lambda example: example['distance'], reverse=False) + + return sorted_result def _s3vectors_find_similar_items_from_image(page_image): """Search for similar items using image query""" diff --git a/notebooks/examples/dynamic-few-shot-lambda/README.md b/notebooks/examples/dynamic-few-shot-lambda/README.md index e30f913e..098be753 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/README.md +++ b/notebooks/examples/dynamic-few-shot-lambda/README.md @@ -117,7 +117,7 @@ extraction: { "attributes_prompt": "Expected attributes are: invoice_number [Unique identifier], invoice_date [Invoice date], total_amount [Total amount]...", "class_prompt": "This is an example of the class 'invoice'", - "distance": 0.892344521145, + "distance": 0.122344521145, # lower is more similar "image_content": ["", "", ...] } ] From 289386bc94a4d7ea7516a7bbf77f0e988a2f5431 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Tue, 2 Dec 2025 14:05:40 +0100 Subject: [PATCH 11/39] chore: debug log for S3 vectors result --- .../dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py index a97fff33..7f234c6d 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py +++ b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py @@ -158,6 +158,7 @@ def _s3vectors_find_similar_items_from_image(page_image): returnDistance=True, returnMetadata=True ) + logger.debug(f"S3 vectors lookup result: {response['vectors']}") return response["vectors"] def _merge_examples(examples, new_examples): From 6fd1b5ee4be30d5d5f9a06752718a6880e71f0ef Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Tue, 2 Dec 2025 14:05:57 +0100 Subject: [PATCH 12/39] chore: filter S3 vectors result by threshold --- .../GENAIIDP-dynamic-few-shot.py | 17 +++++++++++++++-- .../dynamic-few-shot-lambda/template.yml | 7 +++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py index 7f234c6d..495899bc 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py +++ b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py @@ -31,6 +31,7 @@ S3VECTOR_DIMENSIONS = int(os.environ['S3VECTOR_DIMENSIONS']) MODEL_ID = os.environ['MODEL_ID'] TOP_K = int(os.environ['TOP_K']) + THRESHOLD = float(os.environ['THRESHOLD']) except (KeyError, ValueError, IndexError) as e: logger.error(f"Failed to parse environment variables: {e}") raise @@ -64,7 +65,7 @@ def lambda_handler(event, context): try: logger.info("=== DYNAMIC FEW-SHOT LAMBDA INVOKED ===") logger.debug(f"Complete input event: {json.dumps(event, indent=2)}") - + # Validate input class_label = event.get("class_label") document_text = event.get("document_text") @@ -78,6 +79,8 @@ def lambda_handler(event, context): # Decode input data image_data = _decode_images(image_content) + logger.info(f"=== FIND SIMILAR ITEMS ===") + # Find similar items using S3 vectors lookup from image similarity result = 
_s3vectors_find_similar_items(image_data) @@ -141,7 +144,17 @@ def _s3vectors_find_similar_items(image_data): # sort results by distance score (lowest to highest - lower is more similar) sorted_result = sorted(result, key=lambda example: example['distance'], reverse=False) - return sorted_result + # filter result by distance score + filtered_result = [] + for example in sorted_result: + if example['distance'] > THRESHOLD: + logger.info( + f"Skipping example with distance {example['distance']} above threshold {THRESHOLD}: {key}" + ) + else: + filtered_result.append(example) + + return filtered_result def _s3vectors_find_similar_items_from_image(page_image): """Search for similar items using image query""" diff --git a/notebooks/examples/dynamic-few-shot-lambda/template.yml b/notebooks/examples/dynamic-few-shot-lambda/template.yml index 927c9a65..2c5158da 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/template.yml +++ b/notebooks/examples/dynamic-few-shot-lambda/template.yml @@ -35,6 +35,12 @@ Parameters: TopK: Type: Number Default: 2 + Description: The number of results to return for each S3 vectors query. + + Threshold: + Type: Number + Default: 0.2 + Description: Filter results exceeding this similarity threshold (lower is more similar) LambdaFunctionName: Type: String @@ -79,6 +85,7 @@ Resources: S3VECTOR_DIMENSIONS: !Ref VectorDimensions MODEL_ID: !Ref ModelId TOP_K: !Ref TopK + THRESHOLD: !Ref Threshold LoggingConfig: LogGroup: !Ref DynamicFewShotLogGroup # Minimal permissions - only needs basic execution and logging From 8d16da197f09deebdeea14d27569a54e19804742 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Thu, 4 Dec 2025 09:01:50 +0100 Subject: [PATCH 13/39] chore: add comment on PIL requirement for generate_embedding --- lib/idp_common_pkg/idp_common/bedrock/client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/idp_common_pkg/idp_common/bedrock/client.py b/lib/idp_common_pkg/idp_common/bedrock/client.py index 9167a4ce..7e14d4bb 100644 --- a/lib/idp_common_pkg/idp_common/bedrock/client.py +++ b/lib/idp_common_pkg/idp_common/bedrock/client.py @@ -733,6 +733,7 @@ def generate_embedding( Returns: List of floats representing the embedding vector """ + # requires PIL from idp_common.image import ( prepare_image, prepare_bedrock_image_attachment From 0b7a57d4dbbd561e3f61572e65f82e2a7a1e651b Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Thu, 11 Dec 2025 14:54:13 +0100 Subject: [PATCH 14/39] chore: move dynamic-few-shot to plugins folder --- plugins/dynamic-few-shot-lambda/.gitignore | 1 + .../dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py | 0 .../examples => plugins}/dynamic-few-shot-lambda/README.md | 0 .../notebooks}/config/extraction_with_few_shot.yaml | 0 .../notebooks}/fewshot_dataset_import.ipynb | 0 .../notebooks}/step3_extraction_with_dynamic_few_shot.ipynb | 0 .../dynamic-few-shot-lambda/requirements.txt | 0 .../examples => plugins}/dynamic-few-shot-lambda/samconfig.toml | 0 .../examples => plugins}/dynamic-few-shot-lambda/template.yml | 0 9 files changed, 1 insertion(+) create mode 100644 plugins/dynamic-few-shot-lambda/.gitignore rename {notebooks/examples => plugins}/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py (100%) rename {notebooks/examples => plugins}/dynamic-few-shot-lambda/README.md (100%) rename {notebooks/examples => plugins/dynamic-few-shot-lambda/notebooks}/config/extraction_with_few_shot.yaml (100%) rename {notebooks/misc => plugins/dynamic-few-shot-lambda/notebooks}/fewshot_dataset_import.ipynb (100%) rename 
{notebooks/examples => plugins/dynamic-few-shot-lambda/notebooks}/step3_extraction_with_dynamic_few_shot.ipynb (100%) rename {notebooks/examples => plugins}/dynamic-few-shot-lambda/requirements.txt (100%) rename {notebooks/examples => plugins}/dynamic-few-shot-lambda/samconfig.toml (100%) rename {notebooks/examples => plugins}/dynamic-few-shot-lambda/template.yml (100%) diff --git a/plugins/dynamic-few-shot-lambda/.gitignore b/plugins/dynamic-few-shot-lambda/.gitignore new file mode 100644 index 00000000..c0190e10 --- /dev/null +++ b/plugins/dynamic-few-shot-lambda/.gitignore @@ -0,0 +1 @@ +datasets \ No newline at end of file diff --git a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py similarity index 100% rename from notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py rename to plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py diff --git a/notebooks/examples/dynamic-few-shot-lambda/README.md b/plugins/dynamic-few-shot-lambda/README.md similarity index 100% rename from notebooks/examples/dynamic-few-shot-lambda/README.md rename to plugins/dynamic-few-shot-lambda/README.md diff --git a/notebooks/examples/config/extraction_with_few_shot.yaml b/plugins/dynamic-few-shot-lambda/notebooks/config/extraction_with_few_shot.yaml similarity index 100% rename from notebooks/examples/config/extraction_with_few_shot.yaml rename to plugins/dynamic-few-shot-lambda/notebooks/config/extraction_with_few_shot.yaml diff --git a/notebooks/misc/fewshot_dataset_import.ipynb b/plugins/dynamic-few-shot-lambda/notebooks/fewshot_dataset_import.ipynb similarity index 100% rename from notebooks/misc/fewshot_dataset_import.ipynb rename to plugins/dynamic-few-shot-lambda/notebooks/fewshot_dataset_import.ipynb diff --git a/notebooks/examples/step3_extraction_with_dynamic_few_shot.ipynb b/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb similarity index 100% rename from notebooks/examples/step3_extraction_with_dynamic_few_shot.ipynb rename to plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb diff --git a/notebooks/examples/dynamic-few-shot-lambda/requirements.txt b/plugins/dynamic-few-shot-lambda/requirements.txt similarity index 100% rename from notebooks/examples/dynamic-few-shot-lambda/requirements.txt rename to plugins/dynamic-few-shot-lambda/requirements.txt diff --git a/notebooks/examples/dynamic-few-shot-lambda/samconfig.toml b/plugins/dynamic-few-shot-lambda/samconfig.toml similarity index 100% rename from notebooks/examples/dynamic-few-shot-lambda/samconfig.toml rename to plugins/dynamic-few-shot-lambda/samconfig.toml diff --git a/notebooks/examples/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml similarity index 100% rename from notebooks/examples/dynamic-few-shot-lambda/template.yml rename to plugins/dynamic-few-shot-lambda/template.yml From 035b28c2e2b0fcbf493ee1caf30e4f7270bb1fcf Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Thu, 11 Dec 2025 15:11:03 +0100 Subject: [PATCH 15/39] chore: ignore datasets folder --- plugins/dynamic-few-shot-lambda/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/dynamic-few-shot-lambda/.gitignore b/plugins/dynamic-few-shot-lambda/.gitignore index c0190e10..f3c07f0d 100644 --- a/plugins/dynamic-few-shot-lambda/.gitignore +++ b/plugins/dynamic-few-shot-lambda/.gitignore @@ -1 +1 @@ -datasets \ No newline at end of 
file +datasets/ \ No newline at end of file From 854fa8b23e9d3bd4a9ea9d612959bc1239850256 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Thu, 11 Dec 2025 15:13:38 +0100 Subject: [PATCH 16/39] chore: ruff format --- .../GENAIIDP-dynamic-few-shot.py | 63 +++++++++++-------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py index 495899bc..f2b6edb9 100644 --- a/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py +++ b/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py @@ -26,20 +26,21 @@ # Parse environment variables with error handling try: - S3VECTOR_BUCKET = os.environ['S3VECTOR_BUCKET'] - S3VECTOR_INDEX = os.environ['S3VECTOR_INDEX'] - S3VECTOR_DIMENSIONS = int(os.environ['S3VECTOR_DIMENSIONS']) - MODEL_ID = os.environ['MODEL_ID'] - TOP_K = int(os.environ['TOP_K']) - THRESHOLD = float(os.environ['THRESHOLD']) + S3VECTOR_BUCKET = os.environ["S3VECTOR_BUCKET"] + S3VECTOR_INDEX = os.environ["S3VECTOR_INDEX"] + S3VECTOR_DIMENSIONS = int(os.environ["S3VECTOR_DIMENSIONS"]) + MODEL_ID = os.environ["MODEL_ID"] + TOP_K = int(os.environ["TOP_K"]) + THRESHOLD = float(os.environ["THRESHOLD"]) except (KeyError, ValueError, IndexError) as e: logger.error(f"Failed to parse environment variables: {e}") raise # Initialize clients -s3vectors = boto3.client('s3vectors') +s3vectors = boto3.client("s3vectors") bedrock_client = bedrock.BedrockClient() + def lambda_handler(event, context): """ Process a document to find similar examples using S3 Vectors similarity search. @@ -61,7 +62,7 @@ def lambda_handler(event, context): } ] """ - + try: logger.info("=== DYNAMIC FEW-SHOT LAMBDA INVOKED ===") logger.debug(f"Complete input event: {json.dumps(event, indent=2)}") @@ -73,7 +74,9 @@ def lambda_handler(event, context): logger.info(f"=== INPUT VALUES ===") logger.info(f"Class label: {class_label if class_label else 'Not specified'}") - logger.info(f"Document text: {len(document_text) if document_text else "0"} bytes") + logger.info( + f"Document text: {len(document_text) if document_text else '0'} bytes" + ) logger.info(f"Image content: {len(image_content)} images") # Decode input data @@ -91,15 +94,18 @@ def lambda_handler(event, context): logger.info("=== DYNAMIC FEW-SHOT LAMBDA COMPLETED ===") return result - + except Exception as e: logger.error(f"=== DYNAMIC FEW-SHOT LAMBDA ERROR ===") logger.error(f"Error type: {type(e).__name__}") logger.error(f"Error message: {str(e)}") - logger.error(f"Input event keys: {list(event.keys()) if 'event' in locals() else 'Unknown'}") + logger.error( + f"Input event keys: {list(event.keys()) if 'event' in locals() else 'Unknown'}" + ) # In demo, we'll fail gracefully with detailed error info raise Exception(f"Dynamic few-shot Lambda failed: {str(e)}") + def _decode_images(image_content): """Base64 decode image content to bytes""" result = [] @@ -108,14 +114,16 @@ def _decode_images(image_content): result.append(image_data) return result + def _encode_images(image_content): """Base64 encode image content to JSON-serializable string""" result = [] for image_bytes in image_content: - image_base64 = base64.b64encode(image_bytes).decode("utf-8") - result.append(image_base64) + image_base64 = base64.b64encode(image_bytes).decode("utf-8") + result.append(image_base64) return result + def _s3vectors_find_similar_items(image_data): """Find similar items for input""" # find similar items based on image similarity only @@ -133,21 
+141,21 @@ def _s3vectors_find_similar_items(image_data): # Only process this example if it has a non-empty attributesPrompt if not attributes_prompt or not attributes_prompt.strip(): - logger.info( - f"Skipping example with empty attributesPrompt: {key}" - ) + logger.info(f"Skipping example with empty attributesPrompt: {key}") continue attributes = _extract_metadata(metadata, distance) result.append(attributes) # sort results by distance score (lowest to highest - lower is more similar) - sorted_result = sorted(result, key=lambda example: example['distance'], reverse=False) + sorted_result = sorted( + result, key=lambda example: example["distance"], reverse=False + ) # filter result by distance score filtered_result = [] for example in sorted_result: - if example['distance'] > THRESHOLD: + if example["distance"] > THRESHOLD: logger.info( f"Skipping example with distance {example['distance']} above threshold {THRESHOLD}: {key}" ) @@ -156,6 +164,7 @@ def _s3vectors_find_similar_items(image_data): return filtered_result + def _s3vectors_find_similar_items_from_image(page_image): """Search for similar items using image query""" embedding = bedrock_client.generate_embedding( @@ -169,11 +178,12 @@ def _s3vectors_find_similar_items_from_image(page_image): queryVector={"float32": embedding}, topK=TOP_K, returnDistance=True, - returnMetadata=True + returnMetadata=True, ) logger.debug(f"S3 vectors lookup result: {response['vectors']}") return response["vectors"] + def _merge_examples(examples, new_examples): """ Merge in-place new examples into the result list, avoiding duplicates. @@ -185,7 +195,7 @@ def _merge_examples(examples, new_examples): for new_example in new_examples: key = new_example["key"] new_distance = new_example.get("distance", 1.0) - + # update example if examples.get(key): existing_distance = examples[key].get("distance", 1.0) @@ -195,9 +205,10 @@ def _merge_examples(examples, new_examples): else: examples[key] = { "distance": new_distance, - "metadata": new_example.get("metadata") + "metadata": new_example.get("metadata"), } + def _extract_metadata(metadata, distance): """Create result object from S3 vectors metadata""" # Result object attributes @@ -215,6 +226,7 @@ def _extract_metadata(metadata, distance): return attributes + def _get_image_data_from_s3_path(image_path): """ Load images from image path @@ -237,9 +249,7 @@ def _get_image_data_from_s3_path(image_path): # Direct S3 URI image_bytes = s3.get_binary_content(image_file_path) else: - raise ValueError( - f"Invalid file path {image_path} - expecting S3 path" - ) + raise ValueError(f"Invalid file path {image_path} - expecting S3 path") image_content.append(image_bytes) except Exception as e: @@ -248,6 +258,7 @@ def _get_image_data_from_s3_path(image_path): return image_content + def _get_image_files_from_s3_path(image_path): """ Get list of image files from an S3 path. 
@@ -260,9 +271,7 @@ def _get_image_files_from_s3_path(image_path): """ # Handle S3 URIs if not image_path.startswith("s3://"): - raise ValueError( - f"Invalid file path {image_path} - expecting S3 URI" - ) + raise ValueError(f"Invalid file path {image_path} - expecting S3 URI") # Check if it's a direct file or a prefix if image_path.endswith( From b5f88732ac803a0e442115b557b7cf800826a968 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Thu, 11 Dec 2025 23:38:43 +0100 Subject: [PATCH 17/39] feat: update dynamic-few-shot Lambda to implement Custom Prompt Lambda interface --- .../GENAIIDP-dynamic-few-shot.py | 284 ------------ plugins/dynamic-few-shot-lambda/README.md | 6 +- .../dynamic-few-shot-lambda/requirements.txt | 1 - .../src/GENAIIDP-dynamic-few-shot.py | 416 ++++++++++++++++++ .../src/requirements.txt | 1 + plugins/dynamic-few-shot-lambda/template.yml | 21 +- 6 files changed, 440 insertions(+), 289 deletions(-) delete mode 100644 plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py delete mode 100644 plugins/dynamic-few-shot-lambda/requirements.txt create mode 100644 plugins/dynamic-few-shot-lambda/src/GENAIIDP-dynamic-few-shot.py create mode 100644 plugins/dynamic-few-shot-lambda/src/requirements.txt diff --git a/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py deleted file mode 100644 index f2b6edb9..00000000 --- a/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py +++ /dev/null @@ -1,284 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: MIT-0 - -""" -Lambda function to provide examples with ground truth data based on S3 Vectors lookup. - -Key Features Demonstrated: -- Dynamically retrieve similar examples based on document content using vector similarity search -- Provide few-shot examples to improve extraction accuracy through example-based prompting -- Leverage S3 Vectors for efficient similarity search across large example datasets -- Integrate multimodal embeddings using Amazon Nova models for image-based similarity -- Customize example selection based on document characteristics and business rules -""" - -import json -import logging -import base64 -import boto3 -import os - -from idp_common import bedrock, s3 - -logger = logging.getLogger(__name__) -level = logging.getLevelName(os.environ.get("LOG_LEVEL", "INFO")) -logger.setLevel(level) - -# Parse environment variables with error handling -try: - S3VECTOR_BUCKET = os.environ["S3VECTOR_BUCKET"] - S3VECTOR_INDEX = os.environ["S3VECTOR_INDEX"] - S3VECTOR_DIMENSIONS = int(os.environ["S3VECTOR_DIMENSIONS"]) - MODEL_ID = os.environ["MODEL_ID"] - TOP_K = int(os.environ["TOP_K"]) - THRESHOLD = float(os.environ["THRESHOLD"]) -except (KeyError, ValueError, IndexError) as e: - logger.error(f"Failed to parse environment variables: {e}") - raise - -# Initialize clients -s3vectors = boto3.client("s3vectors") -bedrock_client = bedrock.BedrockClient() - - -def lambda_handler(event, context): - """ - Process a document to find similar examples using S3 Vectors similarity search. - - Input event: - { - "class_label": "", - "document_text": "", - "image_content": ["", "", ...] - } - - Return format: - [ - { - "attributes_prompt": "expected attributes are: ...", - "class_prompt": "This is an example of the class 'invoice'", - "distance": 0.122344521145, - "image_content": ["", "", ...] 
- } - ] - """ - - try: - logger.info("=== DYNAMIC FEW-SHOT LAMBDA INVOKED ===") - logger.debug(f"Complete input event: {json.dumps(event, indent=2)}") - - # Validate input - class_label = event.get("class_label") - document_text = event.get("document_text") - image_content = event.get("image_content", []) - - logger.info(f"=== INPUT VALUES ===") - logger.info(f"Class label: {class_label if class_label else 'Not specified'}") - logger.info( - f"Document text: {len(document_text) if document_text else '0'} bytes" - ) - logger.info(f"Image content: {len(image_content)} images") - - # Decode input data - image_data = _decode_images(image_content) - - logger.info(f"=== FIND SIMILAR ITEMS ===") - - # Find similar items using S3 vectors lookup from image similarity - result = _s3vectors_find_similar_items(image_data) - - # Log complete output structure - logger.info(f"=== OUTPUT ANALYSIS ===") - logger.debug(f"Complete result: {json.dumps(result, indent=2)}") - logger.info(f"Output items: {len(result)}") - - logger.info("=== DYNAMIC FEW-SHOT LAMBDA COMPLETED ===") - return result - - except Exception as e: - logger.error(f"=== DYNAMIC FEW-SHOT LAMBDA ERROR ===") - logger.error(f"Error type: {type(e).__name__}") - logger.error(f"Error message: {str(e)}") - logger.error( - f"Input event keys: {list(event.keys()) if 'event' in locals() else 'Unknown'}" - ) - # In demo, we'll fail gracefully with detailed error info - raise Exception(f"Dynamic few-shot Lambda failed: {str(e)}") - - -def _decode_images(image_content): - """Base64 decode image content to bytes""" - result = [] - for image_base64 in image_content: - image_data = base64.b64decode(image_base64) - result.append(image_data) - return result - - -def _encode_images(image_content): - """Base64 encode image content to JSON-serializable string""" - result = [] - for image_bytes in image_content: - image_base64 = base64.b64encode(image_bytes).decode("utf-8") - result.append(image_base64) - return result - - -def _s3vectors_find_similar_items(image_data): - """Find similar items for input""" - # find similar items based on image similarity only - similar_items = {} - for page_image in image_data: - result = _s3vectors_find_similar_items_from_image(page_image) - _merge_examples(similar_items, result) - - # create result set - result = [] - for key, example in similar_items.items(): - metadata = example.get("metadata", {}) - distance = example.get("distance") - attributes_prompt = metadata.get("attributesPrompt") - - # Only process this example if it has a non-empty attributesPrompt - if not attributes_prompt or not attributes_prompt.strip(): - logger.info(f"Skipping example with empty attributesPrompt: {key}") - continue - - attributes = _extract_metadata(metadata, distance) - result.append(attributes) - - # sort results by distance score (lowest to highest - lower is more similar) - sorted_result = sorted( - result, key=lambda example: example["distance"], reverse=False - ) - - # filter result by distance score - filtered_result = [] - for example in sorted_result: - if example["distance"] > THRESHOLD: - logger.info( - f"Skipping example with distance {example['distance']} above threshold {THRESHOLD}: {key}" - ) - else: - filtered_result.append(example) - - return filtered_result - - -def _s3vectors_find_similar_items_from_image(page_image): - """Search for similar items using image query""" - embedding = bedrock_client.generate_embedding( - image_source=page_image, - model_id=MODEL_ID, - dimensions=S3VECTOR_DIMENSIONS, - ) - response = 
s3vectors.query_vectors( - vectorBucketName=S3VECTOR_BUCKET, - indexName=S3VECTOR_INDEX, - queryVector={"float32": embedding}, - topK=TOP_K, - returnDistance=True, - returnMetadata=True, - ) - logger.debug(f"S3 vectors lookup result: {response['vectors']}") - return response["vectors"] - - -def _merge_examples(examples, new_examples): - """ - Merge in-place new examples into the result list, avoiding duplicates. - - Args: - examples: Dict of existing examples - new_examples: List of new examples to be merged - """ - for new_example in new_examples: - key = new_example["key"] - new_distance = new_example.get("distance", 1.0) - - # update example - if examples.get(key): - existing_distance = examples[key].get("distance", 1.0) - examples[key]["distance"] = min(new_distance, existing_distance) - examples[key]["metadata"] = new_example.get("metadata") - # insert example - else: - examples[key] = { - "distance": new_distance, - "metadata": new_example.get("metadata"), - } - - -def _extract_metadata(metadata, distance): - """Create result object from S3 vectors metadata""" - # Result object attributes - attributes = { - "attributes_prompt": metadata.get("attributesPrompt"), - "class_prompt": metadata.get("classPrompt"), - "distance": distance, - } - - image_path = metadata.get("imagePath") - if image_path: - image_data = _get_image_data_from_s3_path(image_path) - encoded_images = _encode_images(image_data) - attributes["image_content"] = encoded_images - - return attributes - - -def _get_image_data_from_s3_path(image_path): - """ - Load images from image path - - Args: - image_path: Path to image file, directory, or S3 prefix - - Returns: - List of images (bytes) - """ - # Get list of image files from the path (supports directories/prefixes) - image_files = _get_image_files_from_s3_path(image_path) - image_content = [] - - # Process each image file - for image_file_path in image_files: - try: - # Load image content - if image_file_path.startswith("s3://"): - # Direct S3 URI - image_bytes = s3.get_binary_content(image_file_path) - else: - raise ValueError(f"Invalid file path {image_path} - expecting S3 path") - - image_content.append(image_bytes) - except Exception as e: - logger.warning(f"Failed to load image {image_file_path}: {e}") - continue - - return image_content - - -def _get_image_files_from_s3_path(image_path): - """ - Get list of image files from an S3 path. 
- - Args: - image_path: Path to image file, directory, or S3 prefix - - Returns: - List of image file paths/URIs sorted by filename - """ - # Handle S3 URIs - if not image_path.startswith("s3://"): - raise ValueError(f"Invalid file path {image_path} - expecting S3 URI") - - # Check if it's a direct file or a prefix - if image_path.endswith( - (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp") - ): - # Direct S3 file - return [image_path] - else: - # S3 prefix - list all images - return s3.list_images_from_path(image_path) diff --git a/plugins/dynamic-few-shot-lambda/README.md b/plugins/dynamic-few-shot-lambda/README.md index 098be753..a400f61a 100644 --- a/plugins/dynamic-few-shot-lambda/README.md +++ b/plugins/dynamic-few-shot-lambda/README.md @@ -67,7 +67,7 @@ flowchart TD ```bash # Navigate to the dynamic-few-shot-lambda directory -cd notebooks/examples/dynamic-few-shot-lambda +cd plugins/dynamic-few-shot-lambda # Deploy using AWS SAM sam deploy --guided @@ -86,7 +86,7 @@ aws cloudformation describe-stacks \ ### Step 3: Populate the Examples Dataset -Use the [fewshot_dataset_import.ipynb](../../misc/fewshot_dataset_import.ipynb) notebook to import a dataset into S3 Vectors, or manually upload your example documents and metadata to the S3 bucket and vector index created by the stack. +Use the [fewshot_dataset_import.ipynb](notebooks/fewshot_dataset_import.ipynb) notebook to import a dataset into S3 Vectors, or manually upload your example documents and metadata to the S3 bucket and vector index created by the stack. ### Step 4: Configure IDP to Use Dynamic-few shot @@ -94,7 +94,7 @@ Add the Lambda ARN to your IDP extraction configuration: ```yaml extraction: - dynamic_few_shot_lambda_arn: "arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot" + custom_prompt_lambda_arn: "arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot" ``` ## Lambda Interface diff --git a/plugins/dynamic-few-shot-lambda/requirements.txt b/plugins/dynamic-few-shot-lambda/requirements.txt deleted file mode 100644 index 2048c02c..00000000 --- a/plugins/dynamic-few-shot-lambda/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -../../../lib/idp_common_pkg[extraction,docs_service] # extraction module and document service with dependencies diff --git a/plugins/dynamic-few-shot-lambda/src/GENAIIDP-dynamic-few-shot.py b/plugins/dynamic-few-shot-lambda/src/GENAIIDP-dynamic-few-shot.py new file mode 100644 index 00000000..49aab6c3 --- /dev/null +++ b/plugins/dynamic-few-shot-lambda/src/GENAIIDP-dynamic-few-shot.py @@ -0,0 +1,416 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +""" +Lambda function to provide examples with ground truth data based on S3 Vectors lookup. 
+ +Key Features Demonstrated: +- Dynamically retrieve similar examples based on document content using vector similarity search +- Provide few-shot examples to improve extraction accuracy through example-based prompting +- Leverage S3 Vectors for efficient similarity search across large example datasets +- Integrate multimodal embeddings using Amazon Nova models for image-based similarity +- Customize example selection based on document characteristics and business rules +""" + +import json +import logging +import base64 +import boto3 +import os + +from idp_common import bedrock, s3 +from idp_common.bedrock import format_prompt + +from typing import Any + +logger = logging.getLogger(__name__) +level = logging.getLevelName(os.environ.get("LOG_LEVEL", "INFO")) +logger.setLevel(level) + +# Parse environment variables with error handling +try: + S3VECTOR_BUCKET = os.environ["S3VECTOR_BUCKET"] + S3VECTOR_INDEX = os.environ["S3VECTOR_INDEX"] + S3VECTOR_DIMENSIONS = int(os.environ["S3VECTOR_DIMENSIONS"]) + MODEL_ID = os.environ["MODEL_ID"] + TOP_K = int(os.environ["TOP_K"]) + THRESHOLD = float(os.environ["THRESHOLD"]) +except (KeyError, ValueError, IndexError) as e: + logger.error(f"Failed to parse environment variables: {e}") + raise + +# Initialize clients +s3vectors = boto3.client("s3vectors") +bedrock_client = bedrock.BedrockClient() + + +def lambda_handler(event, context): + """ + Process a document to find similar examples using S3 Vectors similarity search. + This function will expand {FEW_SHOT_EXAMPLES} in the extraction prompt to examples + found in S3 Vectors lookup. + """ + + try: + logger.info("=== DYNAMIC FEW-SHOT LAMBDA INVOKED ===") + logger.debug(f"Complete input event: {json.dumps(event, indent=2)}") + + # Extract key information from the payload + config = event.get("config", {}) + placeholders = event.get("prompt_placeholders", {}) + default_content = event.get("default_task_prompt_content", []) + document = event.get("serialized_document", {}) + + document_class = placeholders.get("DOCUMENT_CLASS", "") + document_text = placeholders.get("DOCUMENT_TEXT", "") + document_image_uris = placeholders.get("DOCUMENT_IMAGE", []) + document_id = document.get("id", "unknown") + + # Log extraction config details + extraction_config = config.get("extraction", {}) + logger.info(f"=== EXTRACTION CONFIG ===") + logger.info(f"Model: {extraction_config.get('model', 'Not specified')}") + logger.info(f"Temperature: {extraction_config.get('temperature', 'Not specified')}") + logger.info(f"Max tokens: {extraction_config.get('max_tokens', 'Not specified')}") + logger.info(f"Custom Lambda ARN: {extraction_config.get('custom_prompt_lambda_arn', 'Not specified')}") + + # Default system prompt from config + default_system_prompt = config.get("extraction", {}).get("system_prompt", "") + logger.info(f"Default system prompt length: {len(default_system_prompt)} characters") + default_task_prompt = config.get("extraction", {}).get("task_prompt", "") + logger.info(f"Default task prompt length: {len(default_task_prompt)} characters") + + logger.info(f"=== HANDLE INPUT DOCUMENT ===") + + # Handle input document + result = _handle_input_document(placeholders, default_system_prompt, default_task_prompt) + + # Log complete output structure + logger.info(f"=== OUTPUT ANALYSIS ===") + logger.info(f"Output keys: {list(result.keys())}") + logger.info(f"System prompt length: {len(result.get('system_prompt', ''))}") + logger.info(f"System prompt (first 200 chars): {result.get('system_prompt', '')[:200]}...") + + 
task_content = result.get('task_prompt_content', [])
+        logger.info(f"Task prompt content items: {len(task_content)}")
+        for i, item in enumerate(task_content[:3]): # Log first 3 items
+            logger.info(f"Content item {i}: keys={list(item.keys())}")
+            if 'text' in item:
+                logger.info(f"  Text length: {len(item['text'])} characters")
+                logger.info(f"  Text sample (first 150 chars): {item['text'][:150]}...")
+            if 'image_uri' in item:
+                logger.info(f"  Image URI: {item['image_uri']}")
+
+        if len(task_content) > 3:
+            logger.info(f"  ... and {len(task_content) - 3} more content items")
+
+        logger.debug(f"Complete result output: {json.dumps(result, indent=2)}")
+        logger.info("=== DYNAMIC FEW-SHOT LAMBDA COMPLETED ===")
+        return result
+
+    except Exception as e:
+        logger.error("=== DYNAMIC FEW-SHOT LAMBDA ERROR ===")
+        logger.error(f"Error type: {type(e).__name__}")
+        logger.error(f"Error message: {str(e)}")
+        logger.error(
+            f"Input event keys: {list(event.keys()) if 'event' in locals() else 'Unknown'}"
+        )
+        # Re-raise with context so the failure surfaces clearly in the IDP pipeline logs
+        raise Exception(f"Dynamic few-shot Lambda failed: {str(e)}")
+
+def _handle_input_document(placeholders, default_system_prompt, default_task_prompt):
+    """
+    Handle the input request and return custom system_prompt and task_prompt_content
+    """
+    substitutions = {
+        "DOCUMENT_TEXT": placeholders.get("DOCUMENT_TEXT"),
+        "DOCUMENT_CLASS": placeholders.get("DOCUMENT_CLASS"),
+        "ATTRIBUTE_NAMES_AND_DESCRIPTIONS": placeholders.get("ATTRIBUTE_NAMES_AND_DESCRIPTIONS")
+    }
+    task_prompt_content = _build_prompt_content(
+        default_task_prompt, substitutions, placeholders.get("DOCUMENT_IMAGE")
+    )
+
+    return {
+        "system_prompt": default_system_prompt,
+        "task_prompt_content": task_prompt_content
+    }
+
+
+def _build_prompt_content(
+    prompt_template: str,
+    substitutions: dict[str, Any],
+    image_content: Any = None,
+) -> list[dict[str, Any]]:
+    """
+    Build prompt content array handling FEW_SHOT_EXAMPLES and DOCUMENT_IMAGE placeholders.
+
+    This consolidated method handles all placeholder types and combinations:
+    - {FEW_SHOT_EXAMPLES}: Inserts few-shot examples retrieved from S3 Vectors
+    - {DOCUMENT_IMAGE}: Inserts images at a specific location
+    - Regular text placeholders: DOCUMENT_TEXT, DOCUMENT_CLASS, etc.
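+
+    Text placeholders are substituted via idp_common.bedrock.format_prompt.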
+
+    Args:
+        prompt_template: The prompt template with optional placeholders
+        substitutions: Dictionary of placeholder values
+        image_content: Optional image content to insert (only used with {DOCUMENT_IMAGE})
+
+    Returns:
+        List of content items with text and image content properly ordered
+    """
+    content: list[dict[str, Any]] = []
+
+    # Handle FEW_SHOT_EXAMPLES placeholder first
+    if "{FEW_SHOT_EXAMPLES}" in prompt_template:
+        parts = prompt_template.split("{FEW_SHOT_EXAMPLES}")
+        if len(parts) == 2:
+            # Process before examples
+            content.extend(
+                _build_text_and_image_content(parts[0], substitutions, image_content)
+            )
+
+            # Add few-shot examples
+            content.extend(_build_few_shot_examples_content(image_content))
+
+            # Process after examples (only pass images if not already used)
+            image_for_after = (
+                None if "{DOCUMENT_IMAGE}" in parts[0] else image_content
+            )
+            content.extend(
+                _build_text_and_image_content(parts[1], substitutions, image_for_after)
+            )
+
+            return content
+
+    # No usable FEW_SHOT_EXAMPLES placeholder, just handle text and images
+    logger.warning("Missing or malformed {FEW_SHOT_EXAMPLES} placeholder in prompt template")
+    return _build_text_and_image_content(prompt_template, substitutions, image_content)
+
+
+def _build_text_and_image_content(
+    prompt_template: str,
+    substitutions: dict[str, Any],
+    image_content: Any = None,
+) -> list[dict[str, Any]]:
+    """
+    Build content array with text and optionally images based on the DOCUMENT_IMAGE placeholder.
+
+    Args:
+        prompt_template: Template that may contain {DOCUMENT_IMAGE}
+        substitutions: Dictionary of placeholder values
+        image_content: Optional image content
+
+    Returns:
+        List of content items
+    """
+    content: list[dict[str, Any]] = []
+
+    if "{DOCUMENT_IMAGE}" in prompt_template:
+        parts = prompt_template.split("{DOCUMENT_IMAGE}")
+        if len(parts) == 2:
+            # Add text before image
+            before_text = _prepare_prompt_from_template(
+                parts[0], substitutions, required_placeholders=[]
+            )
+            if before_text.strip():
+                content.append({"text": before_text})
+
+            # Add images
+            if image_content:
+                for image_uri in image_content:
+                    content.append({"image_uri": image_uri})
+
+            # Add text after image
+            after_text = _prepare_prompt_from_template(
+                parts[1], substitutions, required_placeholders=[]
+            )
+            if after_text.strip():
+                content.append({"text": after_text})
+
+            return content
+        else:
+            logger.warning("Invalid DOCUMENT_IMAGE placeholder usage (expected exactly one occurrence)")
+
+    # No image placeholder, just text
+    task_prompt = _prepare_prompt_from_template(
+        prompt_template, substitutions, required_placeholders=[]
+    )
+    content.append({"text": task_prompt})
+
+    return content
+
+
+def _build_few_shot_examples_content(image_content: Any = None) -> list[dict[str, Any]]:
+    """
+    Build content items for few-shot examples retrieved from S3 Vectors for a specific class.
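+    Examples are found by embedding the document page images and querying the vector index.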
+
+    Args:
+        image_content: Optional document image content
+
+    Returns:
+        List of content items containing text and image content for examples
+    """
+    content: list[dict[str, Any]] = []
+
+    image_data = []
+    if image_content:
+        for image_uri in image_content:
+            # Load image content
+            if image_uri.startswith("s3://"):
+                # Direct S3 URI
+                image_bytes = s3.get_binary_content(image_uri)
+            else:
+                raise ValueError(f"Invalid file path {image_uri} - expecting S3 path")
+
+            image_data.append(image_bytes)
+
+    examples = _s3vectors_find_similar_items(image_data)
+    for example in examples:
+        content.append({"text": example.get("attributesPrompt")})
+
+        for image_uri in example.get("imageFiles", []):
+            content.append({"image_uri": image_uri})
+
+    return content
+
+
+def _prepare_prompt_from_template(prompt_template, substitutions, required_placeholders):
+    """
+    Prepare prompt from template by replacing placeholders with values.
+
+    Args:
+        prompt_template: The prompt template with placeholders
+        substitutions: Dictionary of placeholder values
+        required_placeholders: List of placeholder names that must be present in the template
+
+    Returns:
+        String with placeholders replaced by values
+
+    Raises:
+        ValueError: If a required placeholder is missing from the template
+    """
+
+    return format_prompt(prompt_template, substitutions, required_placeholders)
+
+
+def _s3vectors_find_similar_items(image_data):
+    """Find similar examples for the input page images via S3 Vectors lookup"""
+    # find similar items based on image similarity only
+    similar_items = {}
+    for page_image in image_data:
+        result = _s3vectors_find_similar_items_from_image(page_image)
+        _merge_examples(similar_items, result)
+
+    # create result set
+    result = []
+    for key, example in similar_items.items():
+        metadata = example.get("metadata", {})
+        distance = example.get("distance")
+        attributes_prompt = metadata.get("attributesPrompt")
+
+        # Only process this example if it has a non-empty attributesPrompt
+        if not attributes_prompt or not attributes_prompt.strip():
+            logger.info(f"Skipping example with empty attributesPrompt: {key}")
+            continue
+
+        attributes = _extract_metadata(metadata, distance)
+        result.append(attributes)
+
+    # sort results by distance score (lowest to highest - lower is more similar)
+    sorted_result = sorted(
+        result, key=lambda example: example["distance"], reverse=False
+    )
+
+    # filter result by distance score
+    filtered_result = []
+    for example in sorted_result:
+        if example["distance"] > THRESHOLD:
+            logger.info(
+                f"Skipping example with distance {example['distance']} above threshold {THRESHOLD}"
+            )
+        else:
+            filtered_result.append(example)
+
+    return filtered_result
+
+
+def _s3vectors_find_similar_items_from_image(page_image):
+    """Search for similar items using image query"""
+    embedding = bedrock_client.generate_embedding(
+        image_source=page_image,
+        model_id=MODEL_ID,
+        dimensions=S3VECTOR_DIMENSIONS,
+    )
+    response = s3vectors.query_vectors(
+        vectorBucketName=S3VECTOR_BUCKET,
+        indexName=S3VECTOR_INDEX,
+        queryVector={"float32": embedding},
+        topK=TOP_K,
+        returnDistance=True,
+        returnMetadata=True,
+    )
+    logger.debug(f"S3 vectors lookup result: {response['vectors']}")
+    return response["vectors"]
+
+
+def _merge_examples(examples, new_examples):
+    """
+    Merge new examples in-place into the examples dict, avoiding duplicates.
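+    When the same key is returned for multiple pages, the lower (more similar) distance wins.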
+ + Args: + examples: Dict of existing examples + new_examples: List of new examples to be merged + """ + for new_example in new_examples: + key = new_example["key"] + new_distance = new_example.get("distance", 1.0) + + # update example + if examples.get(key): + existing_distance = examples[key].get("distance", 1.0) + examples[key]["distance"] = min(new_distance, existing_distance) + examples[key]["metadata"] = new_example.get("metadata") + # insert example + else: + examples[key] = { + "distance": new_distance, + "metadata": new_example.get("metadata"), + } + + +def _extract_metadata(metadata, distance): + """Create result object from S3 vectors metadata""" + # Result object attributes + attributes = { + "attributesPrompt": metadata.get("attributesPrompt"), + "classPrompt": metadata.get("classPrompt"), + "imageFiles": _get_image_files_from_s3_path(metadata.get("imagePath")), + "distance": distance, + } + + return attributes + + +def _get_image_files_from_s3_path(image_path): + """ + Get list of image files from an S3 path. + + Args: + image_path: Path to image file, directory, or S3 prefix + + Returns: + List of image file paths/URIs sorted by filename + """ + # Handle S3 URIs + if not image_path.startswith("s3://"): + raise ValueError(f"Invalid file path {image_path} - expecting S3 URI") + + # Check if it's a direct file or a prefix + if image_path.endswith( + (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp") + ): + # Direct S3 file + return [image_path] + else: + # S3 prefix - list all images + return s3.list_images_from_path(image_path) diff --git a/plugins/dynamic-few-shot-lambda/src/requirements.txt b/plugins/dynamic-few-shot-lambda/src/requirements.txt new file mode 100644 index 00000000..77b716ca --- /dev/null +++ b/plugins/dynamic-few-shot-lambda/src/requirements.txt @@ -0,0 +1 @@ +../../lib/idp_common_pkg[extraction,docs_service] # extraction module and document service with dependencies diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index 2c5158da..8646df10 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -46,6 +46,14 @@ Parameters: Type: String Default: "GENAIIDP-dynamic-few-shot" + GenAIIDPS3OutputBucketName: + Type: String + Description: "GenAIIDP S3OutputBucketName" + + GenAIIDPCustomerManagedEncryptionKeyArn: + Type: String + Description: "GenAIIDP CustomerManagedEncryptionKey ARN" + Conditions: HasPermissionsBoundary: !Not [!Equals [!Ref PermissionsBoundaryArn, ""]] @@ -69,7 +77,7 @@ Resources: Properties: FunctionName: !Ref LambdaFunctionName PermissionsBoundary: !If [HasPermissionsBoundary, !Ref PermissionsBoundaryArn, !Ref AWS::NoValue] - CodeUri: ./ + CodeUri: ./src Handler: GENAIIDP-dynamic-few-shot.lambda_handler Runtime: python3.12 Architectures: @@ -91,6 +99,8 @@ Resources: # Minimal permissions - only needs basic execution and logging Policies: - AWSLambdaBasicExecutionRole + - S3ReadPolicy: + BucketName: !Ref GenAIIDPS3OutputBucketName - S3ReadPolicy: BucketName: !Ref DynamicFewShotDatasetBucket - Statement: @@ -110,6 +120,15 @@ Resources: - s3vectors:QueryVectors Resource: - !Ref DynamicFewShotVectorIndex + - Effect: Allow + Action: + - kms:Encrypt + - kms:Decrypt + - kms:ReEncrypt* + - kms:GenerateDataKey* + - kms:DescribeKey + Resource: + - !Ref GenAIIDPCustomerManagedEncryptionKeyArn DynamicFewShotLogGroup: Type: AWS::Logs::LogGroup From 4cb63fce032e6b0c20bbb0a350ae33bd35b35049 Mon Sep 17 00:00:00 2001 From: Daniel Lorch 
Date: Thu, 11 Dec 2025 23:40:24 +0100 Subject: [PATCH 18/39] chore: configurable LOG_LEVEL --- plugins/dynamic-few-shot-lambda/template.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index 8646df10..168ea957 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -46,6 +46,10 @@ Parameters: Type: String Default: "GENAIIDP-dynamic-few-shot" + LogLevel: + Type: String + Default: INFO + GenAIIDPS3OutputBucketName: Type: String Description: "GenAIIDP S3OutputBucketName" @@ -87,7 +91,7 @@ Resources: Description: Demo Lambda function for GenAI IDP dynamic few-shot prompting Environment: Variables: - LOG_LEVEL: INFO + LOG_LEVEL: !Ref LogLevel S3VECTOR_BUCKET: !Ref VectorBucketName S3VECTOR_INDEX: !Ref VectorIndexName S3VECTOR_DIMENSIONS: !Ref VectorDimensions From 21c9855343cbcb932aeddab1bded2c6b140c5822 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 12 Dec 2025 17:27:15 +0100 Subject: [PATCH 19/39] feat: convert image_uri to image bytes from custom lambda invocation --- .../idp_common/extraction/service.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/lib/idp_common_pkg/idp_common/extraction/service.py b/lib/idp_common_pkg/idp_common/extraction/service.py index 3cd83a9f..ead4e0a1 100644 --- a/lib/idp_common_pkg/idp_common/extraction/service.py +++ b/lib/idp_common_pkg/idp_common/extraction/service.py @@ -433,6 +433,46 @@ def _make_json_serializable(self, obj: Any) -> Any: # Convert non-serializable objects to string representation return str(obj) + def _convert_image_uris_to_bytes_in_content( + self, content: list[dict[str, Any]] + ) -> list[dict[str, Any]]: + """ + Convert image URIs back to bytes in content array after Lambda processing. 
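+        The Lambda returns lightweight image_uri references; this method loads each
+        image from S3 and wraps it as a Bedrock image attachment.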
+ + Args: + content: Content array from Lambda that may contain image URIs + + Returns: + Content array with image bytes restored + """ + converted_content = [] + + for item in content: + if "image_uri" in item: + image_uri = item["image_uri"] + + # Load image content + if image_uri.startswith("s3://"): + # Direct S3 URI + logger.info(f"Retrieving image {image_uri}") + image_bytes = s3.get_binary_content(image_uri) + else: + raise ValueError( + f"Invalid file path {image_uri} - expecting S3 path" + ) + + converted_item = image.prepare_bedrock_image_attachment(image_bytes) + elif "image" in item: + # Keep existing image objects as-is + converted_item = item.copy() + else: + # Keep non-image items as-is + converted_item = item.copy() + + converted_content.append(converted_item) + + return converted_content + def _invoke_custom_prompt_lambda( self, lambda_arn: str, payload: dict[str, Any] ) -> dict[str, Any]: @@ -486,6 +526,13 @@ def _invoke_custom_prompt_lambda( logger.error(error_msg) raise Exception(error_msg) + # Convert image URIs to bytes in the response + result["task_prompt_content"] = ( + self._convert_image_uris_to_bytes_in_content( + result["task_prompt_content"] + ) + ) + return result except Exception as e: From f99467cc84726ae11c3f3c35b3815a3ffe1b2cda Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 12 Dec 2025 17:28:54 +0100 Subject: [PATCH 20/39] chore: use working bucket from GenAIIDP for dataset + adapt threshold --- plugins/dynamic-few-shot-lambda/README.md | 261 +++++++++++++------ plugins/dynamic-few-shot-lambda/template.yml | 30 +-- 2 files changed, 183 insertions(+), 108 deletions(-) diff --git a/plugins/dynamic-few-shot-lambda/README.md b/plugins/dynamic-few-shot-lambda/README.md index a400f61a..d38e5384 100644 --- a/plugins/dynamic-few-shot-lambda/README.md +++ b/plugins/dynamic-few-shot-lambda/README.md @@ -1,64 +1,77 @@ -# Dynamic-Few Shot Prompting - Complete Guide +# Dynamic Few-Shot Prompting Lambda - Complete Guide -This directory contains the **complete implementation and demonstration** of the dynamic-few shot prompting feature for GenAI IDP Accelerator. This feature enables users to dynamically retrieve few-shot examples using S3 Vectors similarity search to improve extraction accuracy for Pattern 2. +This directory contains the **complete implementation** of the dynamic few-shot prompting Lambda function for GenAI IDP Accelerator. This Lambda function integrates with Pattern 2 extraction as a custom prompt generator, dynamically retrieving similar examples using S3 Vectors similarity search to improve extraction accuracy. 
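+
+For example, a minimal wiring sketch (the ARN below is a placeholder; substitute your region, account, and deployed function name, and adapt the prompt to your classes):
+
+```yaml
+extraction:
+  custom_prompt_lambda_arn: "arn:aws:lambda:us-east-1:123456789012:function:GENAIIDP-dynamic-few-shot"
+  task_prompt: |
+    Extract the following attributes from this {DOCUMENT_CLASS} document:
+    {ATTRIBUTE_NAMES_AND_DESCRIPTIONS}
+    {FEW_SHOT_EXAMPLES}
+    Document text: {DOCUMENT_TEXT}
+```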
## ๐ŸŽฏ Overview -The dynamic-few shot prompting feature allows you to: +The dynamic few-shot prompting Lambda function allows you to: - **Dynamically retrieve similar examples** based on document content using vector similarity search -- **Provide few-shot examples** to improve extraction accuracy through example-based prompting +- **Automatically inject few-shot examples** into extraction prompts using the `{FEW_SHOT_EXAMPLES}` placeholder - **Leverage S3 Vectors** for efficient similarity search across large example datasets - **Integrate multimodal embeddings** using Amazon Nova models for image-based similarity -- **Customize example selection** based on document characteristics and business rules +- **Seamlessly integrate** with existing IDP extraction workflows as a custom prompt Lambda ## ๐Ÿ“ Files in This Directory -- **`GENAIIDP-dynamic-few-shot.py`** - Dynamic few-shot Lambda function with S3 Vectors lookup -- **`template.yml`** - CloudFormation SAM template to deploy the complete stack -- **`requirements.txt`** - Python dependencies for the Lambda function +- **`src/GENAIIDP-dynamic-few-shot.py`** - Dynamic few-shot Lambda function with S3 Vectors lookup +- **`src/requirements.txt`** - Python dependencies for the Lambda function +- **`template.yml`** - CloudFormation SAM template to deploy the Lambda function - **`README.md`** - This comprehensive documentation and guide ## ๐Ÿ—๏ธ Architecture ```mermaid flowchart TD - A[Document Processing] --> B{Dynamic-few shot configured?} - B -->|No| C[Use Default Extraction] - B -->|Yes| D[Invoke Dynamic-few shot Lambda] - - subgraph Lambda - D --> E[Receive Document Images] - E --> F[Generate Embeddings with Nova] - F --> G[Query S3 Vectors Index] - G --> H[Retrieve Similar Examples] - H --> I[Load Example Images from S3] - I --> J[Format Examples for Bedrock] + A[IDP Document Processing] --> B{Custom Prompt Lambda ARN configured?} + B -->|No| C[Use Default Task Prompt] + B -->|Yes| D[Invoke Dynamic Few-Shot Lambda] + + subgraph "Lambda Function: GENAIIDP-dynamic-few-shot" + D --> E[Receive IDP Context & Placeholders] + E --> F[Extract Document Images from DOCUMENT_IMAGE] + F --> G[Generate Nova Multimodal Embeddings] + G --> H[Query S3 Vectors Index] + H --> I[Filter by Distance Threshold] + I --> J[Merge & Deduplicate Results] + J --> K[Load Example Images from S3] + K --> L[Build Prompt Content Array] + L --> M[Replace FEW_SHOT_EXAMPLES Placeholder] end - J --> K[Use Examples in Extraction Prompt] - C --> L[Continue with Standard Extraction] - K --> L + M --> N[Return Modified Task Prompt Content] + C --> O[Continue with Bedrock Extraction] + N --> O - subgraph Input - M[Document Class] - N[Document Text] - O[Document Images] + subgraph "Input Payload" + P[config: IDP Configuration] + Q[prompt_placeholders: DOCUMENT_TEXT, DOCUMENT_CLASS, etc.] 
+ R[default_task_prompt_content: Original prompt] + S[serialized_document: Document metadata] end - subgraph Output - P[Example Attributes Prompts] - Q[Example Images] - R[Similarity Distances] + subgraph "Output Payload" + T[system_prompt: Unchanged] + U[task_prompt_content: Array with Prompt segments and Example images] end - D -.-> M - D -.-> N - D -.-> O + D -.-> P + D -.-> Q + D -.-> R + D -.-> S - J -.-> P - J -.-> Q - J -.-> R + N -.-> T + N -.-> U + + subgraph "S3 Vectors Infrastructure" + X[Vector Bucket: Encrypted storage] + Y[Vector Index: 3072-dim cosine similarity] + Z[Metadata: classPrompt, attributesPrompt, imagePath] + end + + H -.-> X + H -.-> Y + H -.-> Z ``` ## Quick Start @@ -88,7 +101,7 @@ aws cloudformation describe-stacks \ Use the [fewshot_dataset_import.ipynb](notebooks/fewshot_dataset_import.ipynb) notebook to import a dataset into S3 Vectors, or manually upload your example documents and metadata to the S3 bucket and vector index created by the stack. -### Step 4: Configure IDP to Use Dynamic-few shot +### Step 4: Configure IDP to Use Dynamic Few-Shot Add the Lambda ARN to your IDP extraction configuration: @@ -97,42 +110,81 @@ extraction: custom_prompt_lambda_arn: "arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot" ``` +**Important**: Your extraction task prompt must include the `{FEW_SHOT_EXAMPLES}` placeholder where you want the dynamic examples to be inserted. + +### Step 5: Run the Demo Notebook + +0. Run `notebooks/examples` steps 0, 1, 2 +1. Open `plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_custom_lambda.ipynb` +2. Run all cells to see the comparison + ## Lambda Interface ### Input Payload Structure + +The Lambda receives the full IDP context as a custom prompt Lambda: + ```json { - "class_label": "invoice", - "document_text": "Text or markdown from section 1 (pages 1-3)...", - "image_content": [ - "base64_encoded_image_1", - "base64_encoded_image_2" - ] + "config": { + "extraction": {...}, + "classes": [...], + ... + }, + "prompt_placeholders": { + "DOCUMENT_TEXT": "Full OCR text from all pages", + "DOCUMENT_CLASS": "invoice", + "ATTRIBUTE_NAMES_AND_DESCRIPTIONS": "LineItems: List of line items in the invoice...", + "DOCUMENT_IMAGE": ["s3://bucket/document/page1.jpg", "s3://bucket/document/page2.jpg"] + }, + "default_task_prompt_content": [ + {"text": "Resolved default task prompt..."}, + {"image_uri": "s3://..."}, // if images present + {"cachePoint": true} // if cache points present + ], + "serialized_document": { + "id": "document-123", + "input_bucket": "my-bucket", + "pages": {...}, + "sections": [...], + ... + } } ``` ### Output Payload Structure + +The Lambda returns modified prompt content with dynamic few-shot examples: + ```json -[ - { - "attributes_prompt": "Expected attributes are: invoice_number [Unique identifier], invoice_date [Invoice date], total_amount [Total amount]...", - "class_prompt": "This is an example of the class 'invoice'", - "distance": 0.122344521145, # lower is more similar - "image_content": ["", "", ...] 
- } -] +{ + "system_prompt": "Custom system prompt text", + "task_prompt_content": [ + {"text": "Extract the following attributes from this invoice document:\n\nLineItems: List of line items in the invoice...\n\n"}, + {"text": "expected attributes are:\n \"invoice_number\": \"INV-2024-001\",\n \"total_amount\": \"$1,250.00\""}, + {"image_uri": "s3://examples-bucket/invoices/example-001/page1.jpg"}, + {"text": "\n\n<>\n\nDocument content:\nINVOICE\nInvoice #: INV-2024-002..."} + ] +} ``` ## Core Functionality -### 1. Vector Similarity Search +### 1. Custom Prompt Integration + +The Lambda integrates with IDP's custom prompt system by: +- Receiving the full extraction context and configuration +- Processing the `{FEW_SHOT_EXAMPLES}` placeholder in task prompts +- Returning modified prompt content with dynamically retrieved examples + +### 2. Vector Similarity Search The Lambda uses Amazon Nova multimodal embeddings to find similar examples: ```python # Generate embedding from document image -embedding = bedrock.generate_embedding( - image_source=image_data, +embedding = bedrock_client.generate_embedding( + image_source=page_image, model_id=MODEL_ID, dimensions=S3VECTOR_DIMENSIONS, ) @@ -148,34 +200,36 @@ response = s3vectors.query_vectors( ) ``` -### 2. Example Merging and Deduplication +### 3. Example Merging and Deduplication Multiple document images are processed and results are merged to avoid duplicates: ```python -def merge_examples(combined_examples, new_examples): +def _merge_examples(examples, new_examples): """Merge examples, keeping the best similarity score for duplicates""" for new_example in new_examples: key = new_example["key"] - if combined_examples.get(key): - # Keep the better (lower) distance score - combined_examples[key]["distance"] = min( - new_example.get("distance"), - combined_examples[key]["distance"] - ) + new_distance = new_example.get("distance", 1.0) + + if examples.get(key): + existing_distance = examples[key].get("distance", 1.0) + examples[key]["distance"] = min(new_distance, existing_distance) ``` -### 3. Example Image Loading +### 4. Prompt Content Building -The Lambda loads example images from S3 paths stored in vector metadata: +The Lambda builds structured prompt content handling multiple placeholders: ```python -def get_image_files_from_s3_path(image_path: str) -> List[str]: - """Get list of image files from S3 path or prefix""" - if image_path.endswith((".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp")): - return [image_path] # Direct file - else: - return s3.list_images_from_path(image_path) # Directory/prefix +def _build_prompt_content(prompt_template, substitutions, image_content): + """ + Build prompt content array handling FEW_SHOT_EXAMPLES and DOCUMENT_IMAGE placeholders. + + Handles: + - {FEW_SHOT_EXAMPLES}: Inserts few-shot examples from S3 Vectors + - {DOCUMENT_IMAGE}: Inserts images at specific location + - Regular text placeholders: DOCUMENT_TEXT, DOCUMENT_CLASS, etc. + """ ``` ## Configuration @@ -188,7 +242,9 @@ The Lambda function uses these environment variables (set by the CloudFormation - `S3VECTOR_INDEX` - Name of the S3 Vectors index - `S3VECTOR_DIMENSIONS` - Embedding dimensions (e.g. `3072` for Nova Multimodal Embedding model) - `MODEL_ID` - Bedrock model ID for embeddings (e.g. 
`amazon.nova-2-multimodal-embeddings-v1:0`) -- `TOP_K` - Number of similar examples to retrieve +- `TOP_K` - Number of similar examples to retrieve (default: 3) +- `THRESHOLD` - Maximum distance threshold for filtering results (default: 0.5) +- `LOG_LEVEL` - Logging level (default: INFO) ### S3 Vectors Configuration @@ -208,16 +264,22 @@ Monitor the Lambda function logs: **Successful Operation:** ``` -Processing document ID: document-123 -Document class: invoice -Response contains 2 elements +=== DYNAMIC FEW-SHOT LAMBDA INVOKED === +=== EXTRACTION CONFIG === +Model: anthropic.claude-3-5-sonnet-20241022-v2:0 +=== HANDLE INPUT DOCUMENT === +=== OUTPUT ANALYSIS === +Output keys: ['system_prompt', 'task_prompt_content'] +Task prompt content items: 5 +=== DYNAMIC FEW-SHOT LAMBDA COMPLETED === ``` **Error Conditions:** ``` -No class_label found in event -No document_texts found in event or not in list format -Failed to load example images from s3://bucket/path: error +Failed to parse environment variables: KeyError('S3VECTOR_BUCKET') +Skipping example with empty attributesPrompt: example_key +Skipping example with distance 0.8 above threshold 0.5: example_key +Invalid file path /local/path - expecting S3 URI ``` ### Performance Monitoring @@ -331,22 +393,53 @@ aws cloudformation delete-stack --stack-name GENAIIDP-dynamic-few-shot-stack ### Configuration in IDP Stack -Add the dynamic-few shot Lambda ARN to your IDP configuration: +Add the dynamic few-shot Lambda ARN to your IDP extraction configuration: ```yaml -# In your IDP stack parameters or configuration extraction: - dynamic_few_shot_lambda_arn: "arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot" + custom_prompt_lambda_arn: "arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot" ``` +### Required Task Prompt Configuration + +**Critical**: Your extraction task prompt must include the `{FEW_SHOT_EXAMPLES}` placeholder where you want the dynamic examples to be inserted. The Lambda specifically looks for this placeholder and replaces it with retrieved examples. + ### Expected Behavior When configured: 1. IDP processes document and extracts images/text -2. Dynamic few-shot Lambda is invoked with document data -3. Lambda returns similar examples with prompts and images -4. IDP includes examples in extraction prompt to Bedrock -5. Bedrock uses examples to improve extraction accuracy +2. IDP invokes the dynamic few-shot Lambda with full extraction context +3. Lambda generates embeddings from document images using Amazon Nova +4. Lambda queries S3 Vectors to find similar examples +5. Lambda loads example images and metadata from S3 +6. Lambda builds modified prompt content with examples inserted at `{FEW_SHOT_EXAMPLES}` location +7. IDP uses the modified prompt content for Bedrock extraction +8. Bedrock uses the dynamic examples to improve extraction accuracy + +### Prompt Flow Example + +**Original Task Prompt:** +``` +Extract attributes from this invoice: +{ATTRIBUTE_NAMES_AND_DESCRIPTIONS} +{FEW_SHOT_EXAMPLES} +<> +Document: {DOCUMENT_TEXT} +``` + +**After Lambda Processing:** +``` +Extract attributes from this invoice: +invoice_number [Unique identifier]... + +expected attributes are: + "invoice_number": "INV-2024-001", + "total_amount": "$1,250.00" +[Example image content] + +<> +Document: INVOICE #INV-2024-002... 
+``` ## Next Steps diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index 168ea957..71f47f81 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -39,7 +39,7 @@ Parameters: Threshold: Type: Number - Default: 0.2 + Default: 0.5 Description: Filter results exceeding this similarity threshold (lower is more similar) LambdaFunctionName: @@ -54,6 +54,10 @@ Parameters: Type: String Description: "GenAIIDP S3OutputBucketName" + GenAIIDPS3WorkingBucketName: + Type: String + Description: "GenAIIDP WorkingBucket Name" + GenAIIDPCustomerManagedEncryptionKeyArn: Type: String Description: "GenAIIDP CustomerManagedEncryptionKey ARN" @@ -106,7 +110,7 @@ Resources: - S3ReadPolicy: BucketName: !Ref GenAIIDPS3OutputBucketName - S3ReadPolicy: - BucketName: !Ref DynamicFewShotDatasetBucket + BucketName: !Ref GenAIIDPS3WorkingBucketName - Statement: - Effect: Allow Action: cloudwatch:PutMetricData @@ -173,28 +177,6 @@ Resources: - "imagePath" VectorBucketArn: !Ref DynamicFewShotVectorBucket - DynamicFewShotDatasetBucket: - Type: AWS::S3::Bucket - DeletionPolicy: RetainExceptOnCreate - Metadata: - cfn_nag: - rules_to_suppress: - - id: W84 - reason: "Demo function - KMS CMK not required, but can be added by customer for production use cases" - # checkov:skip=CKV_AWS_158: "Demo function - KMS CMK not required, but can be added by customer for production use cases" - Properties: - BucketEncryption: - ServerSideEncryptionConfiguration: - - ServerSideEncryptionByDefault: - SSEAlgorithm: "AES256" - PublicAccessBlockConfiguration: - BlockPublicAcls: true - BlockPublicPolicy: true - IgnorePublicAcls: true - RestrictPublicBuckets: true - VersioningConfiguration: - Status: Enabled - Outputs: DynamicFewShotFunctionName: From 72c85f7107ebf610aa365b25a6f2a7250fb7a56a Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 12 Dec 2025 17:29:16 +0100 Subject: [PATCH 21/39] chore: remove FATURA2 dataset import --- .../notebooks/fewshot_dataset_import.ipynb | 487 ------------------ 1 file changed, 487 deletions(-) delete mode 100644 plugins/dynamic-few-shot-lambda/notebooks/fewshot_dataset_import.ipynb diff --git a/plugins/dynamic-few-shot-lambda/notebooks/fewshot_dataset_import.ipynb b/plugins/dynamic-few-shot-lambda/notebooks/fewshot_dataset_import.ipynb deleted file mode 100644 index dc56a646..00000000 --- a/plugins/dynamic-few-shot-lambda/notebooks/fewshot_dataset_import.ipynb +++ /dev/null @@ -1,487 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Few-shot Dataset Import to S3 Vector store\n", - "\n", - "This notebook demonstrates how to import the FATURA2 dataset into S3 Vectors for use with the examples-provider Lambda function.\n", - "\n", - "The FATURA2 dataset contains invoice documents that can be used as few-shot examples for document extraction tasks.\n", - "\n", - "## Process Overview:\n", - "\n", - "1. **Load FATURA2 Dataset** - Download and process the dataset\n", - "2. **Generate Embeddings** - Create multimodal embeddings using Amazon Nova\n", - "3. **Upload to S3 Vectors** - Store embeddings and metadata in S3 Vectors index\n", - "4. **Verify Import** - Test similarity search functionality\n", - "\n", - "> **Note**: This notebook requires AWS credentials with permissions for Bedrock, S3, and S3 Vectors services." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. 
Install Dependencies" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Let's make sure that modules are autoreloaded\n", - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "ROOTDIR=\"../..\"\n", - "# First uninstall existing package (to ensure we get the latest version)\n", - "%pip uninstall -y idp_common\n", - "\n", - "# Install the IDP common package with all components in development mode\n", - "%pip install -q -e \"{ROOTDIR}/lib/idp_common_pkg[dev, all]\"\n", - "\n", - "# Note: We can also install specific components like:\n", - "# %pip install -q -e \"{ROOTDIR}/lib/idp_common_pkg[ocr,classification,extraction,evaluation]\"\n", - "\n", - "# Check installed version\n", - "%pip show idp_common | grep -E \"Version|Location\"\n", - "\n", - "# Install required packages\n", - "%pip install -q pillow requests tqdm pandas\n", - "\n", - "# Optionally use a .env file for environment variables\n", - "try:\n", - " from dotenv import load_dotenv\n", - " load_dotenv() \n", - "except ImportError:\n", - " pass" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Import Libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import zipfile\n", - "import requests\n", - "from pathlib import Path\n", - "from typing import Dict, List, Any\n", - "from tqdm import tqdm\n", - "import pandas as pd\n", - "\n", - "import boto3\n", - "from PIL import Image\n", - "\n", - "# Import IDP common modules\n", - "from idp_common import bedrock\n", - "\n", - "print(\"Libraries imported successfully\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Configure S3 Vectors and Bedrock" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Configuration - Update these values based on your deployment of the 'notebooks/examples/dynamic-few-shot-lambda' stack\n", - "S3_BUCKET_FOR_IMAGES = \"\" # Stack output 'DynamicFewShotDatasetBucket'\n", - "S3_VECTORS_BUCKET = \"genaiidp-dynamic-few-shot\"\n", - "S3_VECTORS_INDEX = \"documents\"\n", - "\n", - "EMBEDDING_MODEL_ID = \"amazon.nova-2-multimodal-embeddings-v1:0\"\n", - "EMBEDDING_DIMENSIONS = 3072\n", - "\n", - "# Initialize clients\n", - "s3vectors_client = boto3.client('s3vectors')\n", - "s3_client = boto3.client('s3')\n", - "bedrock_client = bedrock.BedrockClient()\n", - "\n", - "print(f\"Configured for S3 Vectors bucket: {S3_VECTORS_BUCKET}\")\n", - "print(f\"Configured for S3 Vectors index: {S3_VECTORS_INDEX}\")\n", - "print(f\"Using embedding model: {EMBEDDING_MODEL_ID}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. 
Load FATURA2 Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Download and extract FATURA2 dataset from Zenodo\n", - "print(\"Downloading FATURA2 dataset...\")\n", - "\n", - "# Configuration for this dataset\n", - "IMAGE_VARIANT = 'colored_images'\n", - "ANNOTATION_VARIANT = 'Original_Format'\n", - "CLASS_LABEL = 'invoice'\n", - "\n", - "# Create datasets directory\n", - "datasets_dir = Path('datasets')\n", - "datasets_dir.mkdir(exist_ok=True)\n", - "\n", - "# Download the zip file\n", - "zip_url = 'https://zenodo.org/records/10371464/files/FATURA2.zip?download=1'\n", - "zip_path = datasets_dir / 'FATURA2.zip'\n", - "\n", - "if not zip_path.exists():\n", - " response = requests.get(zip_url, stream=True)\n", - " response.raise_for_status()\n", - " \n", - " with open(zip_path, 'wb') as f:\n", - " for chunk in tqdm(response.iter_content(chunk_size=8192), desc='Downloading'):\n", - " f.write(chunk)\n", - " print(f\"Downloaded {zip_path}\")\n", - "else:\n", - " print(f\"Using existing {zip_path}\")\n", - "\n", - "# Extract the zip file\n", - "extract_dir = datasets_dir / 'invoices_dataset_final'\n", - "if not extract_dir.exists():\n", - " with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n", - " zip_ref.extractall(datasets_dir)\n", - " print(f\"Extracted to {extract_dir}\")\n", - "else:\n", - " print(f\"Using existing {extract_dir}\")\n", - "\n", - "colored_images = extract_dir / IMAGE_VARIANT\n", - "\n", - "# Load images from extracted directory\n", - "image_files = list(colored_images.glob('**/*.jpg'))\n", - "print(f\"Found {len(image_files)} {IMAGE_VARIANT} files\")\n", - "\n", - "# Show sample\n", - "if image_files:\n", - " sample_image = Image.open(image_files[0])\n", - " print(f\"Sample image: {image_files[0].name}\")\n", - " print(f\"Image size: {sample_image.size}\")\n", - "\n", - "print(f\"Image variant: {IMAGE_VARIANT}\")\n", - "print(f\"Annotation variant: {ANNOTATION_VARIANT}\")\n", - "print(f\"Class label: {CLASS_LABEL}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. 
Process Dataset and Generate Embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "def upload_image_to_s3(image_bytes: bytes, s3_key: str) -> str:\n", - " \"\"\"Upload image to S3 and return S3 URI.\"\"\"\n", - " s3_client.put_object(\n", - " Bucket=S3_BUCKET_FOR_IMAGES,\n", - " Key=s3_key,\n", - " Body=image_bytes,\n", - " ContentType='image/jpeg'\n", - " )\n", - " return f\"s3://{S3_BUCKET_FOR_IMAGES}/{s3_key}\"\n", - "\n", - "def load_split(extract_dir, split_name):\n", - " csv_path = extract_dir / (split_name + \".csv\")\n", - " return pd.read_csv(csv_path)\n", - "\n", - "def read_annotation(extract_dir, annot_path):\n", - " json_path = extract_dir / \"Annotations\" / ANNOTATION_VARIANT / annot_path\n", - " with open(json_path, \"r\") as f:\n", - " annotation = f.read()\n", - " return json.loads(annotation)\n", - "\n", - "def load_image(extract_dir, img_path):\n", - " image_path = extract_dir / IMAGE_VARIANT / img_path\n", - " with open(image_path, \"rb\") as f:\n", - " image_content = f.read()\n", - " return image_content\n", - "\n", - "def map_labels(annotations):\n", - " labels = {}\n", - " labels['invoice_number'] = annotations.get(\"NUMBER\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['invoice_date'] = annotations.get(\"DATE\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['due_date'] = annotations.get(\"DUE_DATE\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['vendor_name'] = annotations.get(\"SELLER_NAME\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['vendor_address'] = annotations.get(\"SELLER_ADDRESS\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " BUYER = annotations.get(\"BUYER\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['customer_name'] = BUYER[0] if len(BUYER) > 0 else []\n", - " labels['customer_address'] = BUYER[1:] if len(BUYER) > 1 else []\n", - " labels['items'] = \"null\"\n", - " labels['quantities'] = \"null\"\n", - " labels['unit_prices'] = \"null\"\n", - " labels['subtotal'] = annotations.get(\"SUB_TOTAL\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['tax'] = annotations.get(\"TAX\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['total_amount'] = annotations.get(\"TOTAL\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['payment_terms'] = annotations.get(\"NOTE\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['po_number'] = annotations.get(\"GSTIN_BUYER\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " return labels\n", - "\n", - "def get_attributes_prompt(labels):\n", - " attributes_prompt = f\"\"\"expected attributes are:\n", - " \"invoice_number\": {\", \".join(labels['invoice_number'])}\n", - " \"invoice_date\": {\", \".join(labels['invoice_date'])}\n", - " \"due_date\": {\", \".join(labels['due_date'])}\n", - " \"vendor_name\": {\", \".join(labels['vendor_name'])}\n", - " \"vendor_address\": {\", \".join(labels['vendor_address'])}\n", - " \"customer_name\": {labels['customer_name']}\n", - " \"customer_address\": {\", \".join(labels['customer_address'])}\n", - " \"items\": {labels['items']}\n", - " \"quantities\": {labels['quantities']}\n", - " \"unit_prices\": {labels['unit_prices']}\n", - " \"subtotal\": {\", \".join(labels['subtotal'])}\n", - " \"tax\": {\", \".join(labels['tax'])}\n", - " \"total_amount\": {\", \".join(labels['total_amount'])}\n", - " \"payment_terms\": {\", \".join(labels['payment_terms'])}\n", - " \"po_number\": {\", 
\".join(labels['po_number'])}\n", - " \"\"\".strip()\n", - " return attributes_prompt\n", - "\n", - "def create_metadata(annotations: Dict, s3_image_uri: str) -> Dict:\n", - " \"\"\"Create metadata for S3 Vectors entry.\"\"\"\n", - " class_prompt = f\"This is an example of the class '{CLASS_LABEL}'\"\n", - "\n", - " labels = map_labels(annotations)\n", - " attributes_prompt = get_attributes_prompt(labels)\n", - "\n", - " return {\n", - " \"classLabel\": CLASS_LABEL,\n", - " \"classPrompt\": class_prompt,\n", - " \"attributesPrompt\": attributes_prompt,\n", - " \"imagePath\": s3_image_uri,\n", - " }\n", - "\n", - "print(\"Helper functions defined\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Import Dataset to S3 Vectors" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Process a subset of the dataset (adjust as needed)\n", - "MAX_SAMPLES = 100 # Adjust this number based on your needs\n", - "BATCH_SIZE = 10 # Adjust this number based on your needs\n", - "\n", - "dataset_split = load_split(extract_dir, \"strat1_train\")\n", - "samples_to_process = min(MAX_SAMPLES, len(dataset_split))\n", - "\n", - "print(f\"Processing {samples_to_process} samples from FATURA2 dataset...\")\n", - "\n", - "vectors_to_upload = []\n", - "failed_samples = []\n", - "\n", - "for i in tqdm(range(samples_to_process), desc=\"Processing samples\"):\n", - " try:\n", - " df_image = dataset_split.iloc[i]\n", - "\n", - " # Load annotations\n", - " annotations = read_annotation(extract_dir, df_image[\"annot_path\"])\n", - " \n", - " # Load image\n", - " image_bytes = load_image(extract_dir, df_image[\"img_path\"])\n", - "\n", - " # Upload image to S3\n", - " s3_key = f\"fatura2/{IMAGE_VARIANT}/{df_image['img_path']}\"\n", - " s3_image_uri = upload_image_to_s3(image_bytes, s3_key)\n", - " \n", - " # Generate embedding\n", - " embedding = bedrock_client.generate_embedding(\n", - " image_source=image_bytes,\n", - " model_id=EMBEDDING_MODEL_ID,\n", - " dimensions=EMBEDDING_DIMENSIONS\n", - " )\n", - " \n", - " # Create metadata\n", - " metadata = create_metadata(annotations, s3_image_uri)\n", - "\n", - " # Prepare vector for upload\n", - " vector_entry = {\n", - " \"key\": f\"fatura2_sample_{i:06d}\",\n", - " \"data\": {\"float32\": embedding},\n", - " \"metadata\": metadata\n", - " }\n", - "\n", - " vectors_to_upload.append(vector_entry)\n", - " \n", - " # Upload in batches to avoid memory issues\n", - " if len(vectors_to_upload) >= BATCH_SIZE: # Batch size\n", - " print(f\"\\nUploading batch of {len(vectors_to_upload)} vectors...\")\n", - " response = s3vectors_client.put_vectors(\n", - " vectorBucketName=S3_VECTORS_BUCKET,\n", - " indexName=S3_VECTORS_INDEX,\n", - " vectors=vectors_to_upload\n", - " )\n", - " print(f\"Batch upload response: {response.get('ResponseMetadata', {}).get('HTTPStatusCode')}\")\n", - " vectors_to_upload = [] # Clear batch\n", - " \n", - " except Exception as e:\n", - " print(f\"\\nFailed to process sample {i}: {e}\")\n", - " failed_samples.append(i)\n", - " continue\n", - "\n", - "# Upload remaining vectors\n", - "if vectors_to_upload:\n", - " print(f\"\\nUploading final batch of {len(vectors_to_upload)} vectors...\")\n", - " response = s3vectors_client.put_vectors(\n", - " vectorBucketName=S3_VECTORS_BUCKET,\n", - " indexName=S3_VECTORS_INDEX,\n", - " vectors=vectors_to_upload\n", - " )\n", - " print(f\"Final batch upload response: {response.get('ResponseMetadata', 
{}).get('HTTPStatusCode')}\")\n", - "\n", - "print(f\"\\nImport completed!\")\n", - "print(f\"Successfully processed: {samples_to_process - len(failed_samples)} samples\")\n", - "print(f\"Failed samples: {len(failed_samples)}\")\n", - "if failed_samples:\n", - " print(f\"Failed sample indices: {failed_samples[:10]}...\") # Show first 10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 7. Verify Import with Similarity Search" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Test similarity search with a sample from the dataset\n", - "test_split = load_split(extract_dir, \"strat1_test\")\n", - "\n", - "test_sample_index = 0\n", - "df_image = test_split.iloc[test_sample_index]\n", - "\n", - "test_image_bytes = load_image(extract_dir, df_image[\"img_path\"])\n", - "\n", - "print(f\"Testing similarity search with sample {extract_dir / IMAGE_VARIANT / df_image['img_path']}...\")\n", - "\n", - "# Generate embedding for test image\n", - "test_embedding = bedrock_client.generate_embedding(\n", - " image_source=test_image_bytes,\n", - " model_id=EMBEDDING_MODEL_ID,\n", - " dimensions=EMBEDDING_DIMENSIONS\n", - ")\n", - "\n", - "# Query S3 Vectors for similar examples\n", - "response = s3vectors_client.query_vectors(\n", - " vectorBucketName=S3_VECTORS_BUCKET,\n", - " indexName=S3_VECTORS_INDEX,\n", - " queryVector={\"float32\": test_embedding},\n", - " topK=5,\n", - " returnDistance=True,\n", - " returnMetadata=True\n", - ")\n", - "\n", - "print(f\"\\nFound {len(response['vectors'])} similar examples:\")\n", - "for i, vector in enumerate(response['vectors']):\n", - " distance = vector.get('distance', 'N/A')\n", - " key = vector.get('key', 'N/A')\n", - " metadata = vector.get('metadata', {})\n", - " class_label = metadata.get('classLabel', 'N/A')\n", - " class_prompt = metadata.get('classPrompt', 'N/A')\n", - " attributes_prompt = metadata.get('attributesPrompt', 'N/A')\n", - " image_path = metadata.get('imagePath', 'N/A')\n", - " \n", - " print(f\" {i+1}. Key: {key}\")\n", - " print(f\" Distance: {distance:.4f}\")\n", - " print(f\" Class Label: {image_path}\")\n", - " print(f\" Class Prompt: {class_prompt}\")\n", - " print(f\" Attributes Prompt: {attributes_prompt}\")\n", - " print(f\" Image Path: {image_path}\")\n", - " print()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 8. Summary and Next Steps" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"=== Few-shot Dataset Import Summary ===\")\n", - "print(f\"โœ… Dataset: FATURA2 (Invoice documents)\")\n", - "print(f\"โœ… Samples processed: {samples_to_process - len(failed_samples)}\")\n", - "print(f\"โœ… S3 Vectors Bucket: {S3_VECTORS_BUCKET}\")\n", - "print(f\"โœ… S3 Vectors Index: {S3_VECTORS_INDEX}\")\n", - "print(f\"โœ… Images stored in: s3://{S3_BUCKET_FOR_IMAGES}/fatura2/{IMAGE_VARIANT}/\")\n", - "print(f\"โœ… Embedding Model: {EMBEDDING_MODEL_ID}\")\n", - "print(f\"โœ… Similarity search verified\")\n", - "\n", - "print(\"\\n=== Next Steps ===\")\n", - "print(\"1. Upload your own datasets into S3 Vectors\")\n", - "print(\"2. Configure your IDP extraction to use the examples provider Lambda ARN\")\n", - "print(\"3. 
Test document processing with few-shot examples!\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.11"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}

From 3c50242b5dd340f0c66085c5509324755535e056 Mon Sep 17 00:00:00 2001
From: Daniel Lorch 
Date: Fri, 12 Dec 2025 17:30:14 +0100
Subject: [PATCH 22/39] feat: add fcc_invoices (REALKIE) dataset import

---
 .../fcc_invoices_dataset_import.ipynb         | 761 ++++++++++++++++++
 1 file changed, 761 insertions(+)
 create mode 100644 plugins/dynamic-few-shot-lambda/notebooks/fcc_invoices_dataset_import.ipynb

diff --git a/plugins/dynamic-few-shot-lambda/notebooks/fcc_invoices_dataset_import.ipynb b/plugins/dynamic-few-shot-lambda/notebooks/fcc_invoices_dataset_import.ipynb
new file mode 100644
index 00000000..2dc1fdce
--- /dev/null
+++ b/plugins/dynamic-few-shot-lambda/notebooks/fcc_invoices_dataset_import.ipynb
@@ -0,0 +1,761 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# FCC Invoices Dataset Import to S3 Vector store\n",
+    "\n",
+    "This notebook demonstrates how to import the FCC invoices (REALKIE) dataset into S3 Vectors for use with the dynamic few-shot Lambda function.\n",
+    "\n",
+    "The FCC invoices dataset contains invoice documents that can be used as few-shot examples for document extraction tasks.\n",
+    "\n",
+    "## Process Overview:\n",
+    "\n",
+    "1. **Load FCC Invoices Dataset** - Sync and load the dataset using load_dataset()\n",
+    "2. **Generate Embeddings** - Create multimodal embeddings using Amazon Nova\n",
+    "3. **Upload to S3 Vectors** - Store embeddings and metadata in S3 Vectors index\n",
+    "4. **Verify Import** - Test similarity search functionality\n",
+    "\n",
+    "> **Note**: This notebook requires AWS credentials with permissions for Bedrock, S3, and S3 Vectors services."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Install Dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's make sure that modules are autoreloaded\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "ROOTDIR=\"../../../\"\n",
+    "# First uninstall existing package (to ensure we get the latest version)\n",
+    "%pip uninstall -y idp_common\n",
+    "\n",
+    "# Install the IDP common package with all components in development mode\n",
+    "%pip install -q -e \"{ROOTDIR}/lib/idp_common_pkg[dev, all]\"\n",
+    "\n",
+    "# Note: We can also install specific components like:\n",
+    "# %pip install -q -e \"{ROOTDIR}/lib/idp_common_pkg[ocr,classification,extraction,evaluation]\"\n",
+    "\n",
+    "# Check installed version\n",
+    "%pip show idp_common | grep -E \"Version|Location\"\n",
+    "\n",
+    "# Install required packages\n",
+    "%pip install -q pillow tqdm pandas datasets matplotlib\n",
+    "\n",
+    "# Optionally use a .env file for environment variables\n",
+    "try:\n",
+    "    from dotenv import load_dotenv\n",
+    "    load_dotenv() \n",
+    "except ImportError:\n",
+    "    pass"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. 
Import Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import subprocess\n", + "from pathlib import Path\n", + "from typing import Dict, List, Any\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "import io\n", + "\n", + "import boto3\n", + "from datasets import load_dataset\n", + "\n", + "# Import IDP common modules\n", + "from idp_common import bedrock\n", + "\n", + "print(\"Libraries imported successfully\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Configure S3 Vectors and Bedrock" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration - Update these values from the IDP stack in CloudFormation Resources tab\n", + "GENAIIDP_S3_WORKING_BUCKET = \"\" # From IDP stack Resources tab\n", + "\n", + "S3_VECTORS_BUCKET = \"genaiidp-dynamic-few-shot\"\n", + "S3_VECTORS_INDEX = \"documents\"\n", + "EMBEDDING_MODEL_ID = \"amazon.nova-2-multimodal-embeddings-v1:0\"\n", + "EMBEDDING_DIMENSIONS = 3072\n", + "\n", + "# Initialize clients\n", + "s3vectors_client = boto3.client('s3vectors')\n", + "s3_client = boto3.client('s3')\n", + "bedrock_client = bedrock.BedrockClient()\n", + "\n", + "print(f\"Configured for dataset S3 Bucket: {GENAIIDP_S3_WORKING_BUCKET}\")\n", + "print(f\"Configured for S3 Vectors bucket: {S3_VECTORS_BUCKET}\")\n", + "print(f\"Configured for S3 Vectors index: {S3_VECTORS_INDEX}\")\n", + "print(f\"Using embedding model: {EMBEDDING_MODEL_ID}\")\n", + "print(f\"Using embedding dimensions: {EMBEDDING_DIMENSIONS}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Load FCC Invoices Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sync FCC invoices dataset from S3\n", + "print(\"Syncing FCC invoices dataset from S3...\")\n", + "\n", + "# Configuration for this dataset\n", + "CLASS_LABEL = 'Invoice'\n", + "\n", + "# Create datasets directory\n", + "dataset_root_dir = Path('../datasets')\n", + "dataset_root_dir.mkdir(exist_ok=True)\n", + "\n", + "# Dataset directory\n", + "dataset_dir = dataset_root_dir / 'fcc_invoices'\n", + "\n", + "# Sync dataset from S3 using AWS CLI with Wasabi endpoint\n", + "if not dataset_dir.exists() or not any(dataset_dir.iterdir()):\n", + " print(\"Syncing dataset from S3...\")\n", + " sync_command = [\n", + " 'aws', 's3', 'sync',\n", + " 's3://project-fruitfly/fcc_invoices',\n", + " str(dataset_dir),\n", + " '--endpoint-url=https://s3.us-east-2.wasabisys.com',\n", + " '--no-sign-request'\n", + " ]\n", + " \n", + " try:\n", + " result = subprocess.run(sync_command, capture_output=True, text=True, check=True)\n", + " print(f\"Dataset synced successfully to {dataset_dir}\")\n", + " print(f\"Sync output: {result.stdout}\")\n", + " except subprocess.CalledProcessError as e:\n", + " print(f\"Error syncing dataset: {e}\")\n", + " print(f\"Error output: {e.stderr}\")\n", + " raise\n", + "else:\n", + " print(f\"Using existing dataset at {dataset_dir}\")\n", + "\n", + "# Load the training dataset using load_dataset\n", + "print(\"Loading training dataset...\")\n", + "try:\n", + " # Load dataset from local directory\n", + " dataset = load_dataset('csv', data_dir=str(dataset_dir), split='train')\n", + " print(f\"Loaded dataset with {len(dataset)} samples\")\n", + " \n", + " # Show sample information\n", + " if len(dataset) > 0:\n", + " 
sample = dataset[0]\n", + " print(f\"Sample keys: {list(sample.keys())}\")\n", + " if 'image' in sample:\n", + " print(f\"Sample image size: {sample['image'].size}\")\n", + " \n", + "except Exception as e:\n", + " print(f\"Error loading dataset: {e}\")\n", + " # Fallback: list files in directory\n", + " image_files = list(dataset_dir.glob('**/*.jpg')) + list(dataset_dir.glob('**/*.png'))\n", + " print(f\"Found {len(image_files)} image files in directory\")\n", + " if image_files:\n", + " print(f\"Sample image: {image_files[0].name}\")\n", + " print(f\"Image file size: {image_files[0].stat().st_size} bytes\")\n", + "\n", + "print(f\"Class label: {CLASS_LABEL}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Process Dataset and Generate Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def upload_image_to_s3(image_bytes: bytes, s3_key: str) -> str:\n", + " \"\"\"Upload image to S3 and return S3 URI.\"\"\"\n", + " s3_client.put_object(\n", + " Bucket=GENAIIDP_S3_WORKING_BUCKET,\n", + " Key=s3_key,\n", + " Body=image_bytes,\n", + " ContentType='image/jpeg'\n", + " )\n", + " return f\"s3://{GENAIIDP_S3_WORKING_BUCKET}/{s3_key}\"\n", + "\n", + "def load_csv_labels():\n", + " \"\"\"Load the CSV file with labels and metadata.\"\"\"\n", + " csv_path = dataset_dir / 'train.csv'\n", + " if csv_path.exists():\n", + " try:\n", + " df = pd.read_csv(csv_path)\n", + " print(f\"Loaded CSV with {len(df)} rows\")\n", + " return df\n", + " except Exception as e:\n", + " print(f\"Error loading CSV: {e}\")\n", + " return None\n", + " else:\n", + " print(f\"CSV file not found at {csv_path}\")\n", + " return None\n", + "\n", + "def match_image_to_csv_row(image_path: str, csv_df: pd.DataFrame):\n", + " \"\"\"Match an image path to the corresponding CSV row.\"\"\"\n", + " if csv_df is None:\n", + " return None\n", + " \n", + " # Extract the image filename from the path\n", + " image_name = Path(image_path).name\n", + " \n", + " # Look for matching rows in the CSV\n", + " for idx, row in csv_df.iterrows():\n", + " image_files_str = row.get('image_files', '')\n", + " if image_name in image_files_str:\n", + " return row\n", + " \n", + " return None\n", + "\n", + "def get_image_bytes_from_file(image_path):\n", + " \"\"\"Read image file directly as bytes.\"\"\"\n", + " with open(image_path, 'rb') as f:\n", + " return f.read()\n", + "\n", + "def create_sample_attributes_prompt() -> str:\n", + " \"\"\"Create a sample attributes prompt for FCC invoices based on the actual schema.\"\"\"\n", + " # Updated to match the actual FCC invoices dataset structure and expected JSON schema\n", + " attributes_prompt = \"\"\"expected attributes are:\n", + " \"Agency\": \"Great American Media\",\n", + " \"Advertiser\": \"ISS/HOUSE MAJ PAC\", \n", + " \"GrossTotal\": 94700.00,\n", + " \"PaymentTerms\": \"Cash In Advance\",\n", + " \"AgencyCommission\": 14205.00,\n", + " \"NetAmountDue\": 80495.00,\n", + " \"LineItems\": [\n", + " {\n", + " \"LineItemDescription\": \"TODAY IN FLORIDA @9PM\",\n", + " \"LineItemStartDate\": \"10/18/2016\", \n", + " \"LineItemEndDate\": null,\n", + " \"LineItemDays\": [\"T\"],\n", + " \"LineItemRate\": 500.00\n", + " },\n", + " {\n", + " \"LineItemDescription\": \"CH 7 NEWS @ 10PM\",\n", + " \"LineItemStartDate\": \"10/18/2016\",\n", + " \"LineItemEndDate\": null, \n", + " \"LineItemDays\": [\"T\"],\n", + " \"LineItemRate\": 3200.00\n", + " }\n", + " ]\n", + " \"\"\".strip()\n", + " return 
attributes_prompt\n",
+    "\n",
+    "def parse_ground_truth_labels(labels_json_str: str) -> Dict:\n",
+    "    \"\"\"Parse ground truth labels from the dataset and convert to expected format.\"\"\"\n",
+    "    import json\n",
+    "    import re\n",
+    "    \n",
+    "    try:\n",
+    "        labels = json.loads(labels_json_str)\n",
+    "    except (json.JSONDecodeError, TypeError):\n",
+    "        return None\n",
+    "    \n",
+    "    # Initialize the result structure\n",
+    "    result = {\n",
+    "        \"Agency\": None,\n",
+    "        \"Advertiser\": None,\n",
+    "        \"GrossTotal\": None,\n",
+    "        \"PaymentTerms\": None,\n",
+    "        \"AgencyCommission\": None,\n",
+    "        \"NetAmountDue\": None,\n",
+    "        \"LineItems\": []\n",
+    "    }\n",
+    "    \n",
+    "    # Group line items by their properties\n",
+    "    line_items = {}\n",
+    "    \n",
+    "    for label in labels:\n",
+    "        label_type = label.get('label', '')\n",
+    "        text = label.get('text', '')\n",
+    "        \n",
+    "        # Map top-level fields\n",
+    "        if label_type == 'Agency':\n",
+    "            result['Agency'] = text\n",
+    "        elif label_type == 'Advertiser':\n",
+    "            result['Advertiser'] = text\n",
+    "        elif label_type == 'Gross Total':\n",
+    "            try:\n",
+    "                result['GrossTotal'] = float(text.replace(',', '').replace('$', ''))\n",
+    "            except ValueError:\n",
+    "                result['GrossTotal'] = text\n",
+    "        elif label_type == 'Net Amount Due':\n",
+    "            try:\n",
+    "                result['NetAmountDue'] = float(text.replace(',', '').replace('$', ''))\n",
+    "            except ValueError:\n",
+    "                result['NetAmountDue'] = text\n",
+    "        elif label_type == 'Payment Terms':\n",
+    "            result['PaymentTerms'] = text\n",
+    "        elif label_type == 'Agency Commission':\n",
+    "            try:\n",
+    "                result['AgencyCommission'] = float(text.replace(',', '').replace('$', ''))\n",
+    "            except ValueError:\n",
+    "                result['AgencyCommission'] = text\n",
+    "        \n",
+    "        # Handle line items (group by position or create separate items)\n",
+    "        elif label_type.startswith('Line Item - '):\n",
+    "            field_name = label_type.replace('Line Item - ', '')\n",
+    "            start_pos = label.get('start', 0)\n",
+    "            \n",
+    "            # Use start position to group related line item fields\n",
+    "            # Find the closest line item group\n",
+    "            closest_key = None\n",
+    "            min_distance = float('inf')\n",
+    "            \n",
+    "            for key in line_items.keys():\n",
+    "                distance = abs(start_pos - key)\n",
+    "                if distance < min_distance and distance < 1000: # Within reasonable range\n",
+    "                    min_distance = distance\n",
+    "                    closest_key = key\n",
+    "            \n",
+    "            if closest_key is None:\n",
+    "                closest_key = start_pos\n",
+    "                line_items[closest_key] = {}\n",
+    "            \n",
+    "            # Map field names to expected schema\n",
+    "            if field_name == 'Description':\n",
+    "                line_items[closest_key]['LineItemDescription'] = text\n",
+    "            elif field_name == 'Start Date':\n",
+    "                line_items[closest_key]['LineItemStartDate'] = text\n",
+    "            elif field_name == 'End Date':\n",
+    "                line_items[closest_key]['LineItemEndDate'] = text if text else None\n",
+    "            elif field_name == 'Rate':\n",
+    "                try:\n",
+    "                    line_items[closest_key]['LineItemRate'] = float(text.replace(',', '').replace('$', ''))\n",
+    "                except ValueError:\n",
+    "                    line_items[closest_key]['LineItemRate'] = text\n",
+    "            elif field_name == 'Days':\n",
+    "                # Convert day codes to day names\n",
+    "                # Tokenize the string rather than iterating characters, so that\n",
+    "                # two-character codes like 'Th' and 'Su' are matched correctly\n",
+    "                day_mapping = {\n",
+    "                    'M': 'M', 'T': 'T', 'W': 'W', 'Th': 'Th', 'F': 'F', 'S': 'S', 'Su': 'Su',\n",
+    "                    '1': 'M', '2': 'T', '3': 'W', '4': 'Th', '5': 'F', '6': 'S', '7': 'Su'\n",
+    "                }\n",
+    "                days = []\n",
+    "                for token in re.findall(r'Th|Su|[MTWFS1-7]', text):\n",
+    "                    mapped_day = day_mapping[token]\n",
+    "                    if mapped_day not in days:\n",
+    "                        days.append(mapped_day)\n",
+    "                
line_items[closest_key]['LineItemDays'] = days\n", + " \n", + " # Convert line items dict to list\n", + " result['LineItems'] = list(line_items.values())\n", + " \n", + " return result\n", + "\n", + "def create_metadata(s3_image_uri: str, sample_data: Dict = None) -> Dict:\n", + " \"\"\"Create metadata for S3 Vectors entry.\"\"\"\n", + " class_prompt = f\"This is an example of the class '{CLASS_LABEL}'\"\n", + " \n", + " # If we have actual sample data with labels, use it to create a more accurate attributes prompt\n", + " if sample_data and 'labels' in sample_data:\n", + " parsed_labels = parse_ground_truth_labels(sample_data['labels'])\n", + " if parsed_labels:\n", + " attributes_prompt = f\"expected attributes are: {json.dumps(parsed_labels, indent=2)}\"\n", + " else:\n", + " attributes_prompt = create_sample_attributes_prompt()\n", + " else:\n", + " attributes_prompt = create_sample_attributes_prompt()\n", + "\n", + " return {\n", + " \"classLabel\": CLASS_LABEL,\n", + " \"classPrompt\": class_prompt,\n", + " \"attributesPrompt\": attributes_prompt,\n", + " \"imagePath\": s3_image_uri,\n", + " }\n", + "\n", + "print(\"Helper functions defined\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Import Dataset to S3 Vectors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Process a subset of the dataset (adjust as needed)\n", + "MAX_SAMPLES = 250 # Adjust this number based on your needs\n", + "BATCH_SIZE = 15 # Adjust this number based on your needs\n", + "\n", + "# Load the CSV labels (this contains the image_files information)\n", + "csv_df = load_csv_labels()\n", + "if csv_df is None:\n", + " print(\"Failed to load CSV data. Exiting.\")\n", + " raise Exception(\"CSV loading failed\")\n", + "\n", + "samples_to_process = min(MAX_SAMPLES, len(csv_df))\n", + "print(f\"Processing {samples_to_process} samples from FCC invoices CSV data...\")\n", + "\n", + "vectors_to_upload = []\n", + "failed_samples = []\n", + "\n", + "for i in tqdm(range(samples_to_process), desc=\"Processing samples\"):\n", + " try:\n", + " csv_row = csv_df.iloc[i]\n", + " \n", + " # Get image files from the CSV row\n", + " image_files_str = csv_row.get('image_files', '')\n", + " if not image_files_str:\n", + " print(f\"No image files found for sample {i}\")\n", + " failed_samples.append(i)\n", + " continue\n", + " \n", + " # Parse the image files array (it's stored as a JSON string)\n", + " import json\n", + " try:\n", + " image_files = json.loads(image_files_str)\n", + " except json.JSONDecodeError:\n", + " print(f\"Failed to parse image_files for sample {i}: {image_files_str}\")\n", + " failed_samples.append(i)\n", + " continue\n", + " \n", + " # Use the first image file (or you could process all images)\n", + " if not image_files:\n", + " print(f\"Empty image_files array for sample {i}\")\n", + " failed_samples.append(i)\n", + " continue\n", + " \n", + " # Load the first image file\n", + " image_file_path = image_files[0]\n", + " full_image_path = dataset_root_dir / image_file_path\n", + " \n", + " if not full_image_path.exists():\n", + " print(f\"Image file not found: {full_image_path}\")\n", + " failed_samples.append(i)\n", + " continue\n", + " \n", + " # Load image file as bytes\n", + " image_bytes = get_image_bytes_from_file(full_image_path)\n", + "\n", + " # Upload image to S3\n", + " s3_key = f\"fcc_invoices/sample_{i:06d}.jpg\"\n", + " s3_image_uri = upload_image_to_s3(image_bytes, s3_key)\n", + " \n", + " # 
Generate embedding\n", + " embedding = bedrock_client.generate_embedding(\n", + " image_source=image_bytes,\n", + " model_id=EMBEDDING_MODEL_ID,\n", + " dimensions=EMBEDDING_DIMENSIONS\n", + " )\n", + " \n", + " # Create metadata using the CSV row data\n", + " sample_data = {'labels': csv_row.get('labels')}\n", + " metadata = create_metadata(s3_image_uri, sample_data)\n", + "\n", + " # Prepare vector for upload\n", + " vector_entry = {\n", + " \"key\": f\"fcc_invoices_sample_{i:06d}\",\n", + " \"data\": {\"float32\": embedding},\n", + " \"metadata\": metadata\n", + " }\n", + "\n", + " vectors_to_upload.append(vector_entry)\n", + " \n", + " # Upload in batches to avoid memory issues\n", + " if len(vectors_to_upload) >= BATCH_SIZE:\n", + " print(f\"\\nUploading batch of {len(vectors_to_upload)} vectors...\")\n", + " response = s3vectors_client.put_vectors(\n", + " vectorBucketName=S3_VECTORS_BUCKET,\n", + " indexName=S3_VECTORS_INDEX,\n", + " vectors=vectors_to_upload\n", + " )\n", + " print(f\"Batch upload response: {response.get('ResponseMetadata', {}).get('HTTPStatusCode')}\")\n", + " vectors_to_upload = [] # Clear batch\n", + " \n", + " except Exception as e:\n", + " print(f\"\\nFailed to process sample {i}: {e}\")\n", + " failed_samples.append(i)\n", + " continue\n", + "\n", + "# Upload remaining vectors\n", + "if vectors_to_upload:\n", + " print(f\"\\nUploading final batch of {len(vectors_to_upload)} vectors...\")\n", + " response = s3vectors_client.put_vectors(\n", + " vectorBucketName=S3_VECTORS_BUCKET,\n", + " indexName=S3_VECTORS_INDEX,\n", + " vectors=vectors_to_upload\n", + " )\n", + " print(f\"Final batch upload response: {response.get('ResponseMetadata', {}).get('HTTPStatusCode')}\")\n", + "\n", + "print(f\"\\nImport completed!\")\n", + "print(f\"Successfully processed: {samples_to_process - len(failed_samples)} samples from CSV data\")\n", + "print(f\"Failed samples: {len(failed_samples)}\")\n", + "if failed_samples:\n", + " print(f\"Failed sample indices: {failed_samples[:10]}...\") # Show first 10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. 
Verify Import with Similarity Search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load test split for similarity search verification\n", + "test_dataset = load_dataset('csv', data_dir=str(dataset_dir), split='test')\n", + "print(f\"Loaded test dataset with {len(test_dataset)} samples\")\n", + "\n", + "if test_dataset is not None and len(test_dataset) > 0:\n", + " # Use the first sample from test split\n", + " test_sample_index = 0\n", + " test_csv_row = test_dataset[test_sample_index]\n", + " \n", + " # Get test image from CSV row\n", + " test_image_files_str = test_csv_row.get('image_files', '')\n", + " if test_image_files_str:\n", + " try:\n", + " test_image_files = json.loads(test_image_files_str)\n", + " if test_image_files:\n", + " test_image_path = dataset_root_dir / test_image_files[0]\n", + " if test_image_path.exists():\n", + " test_image_bytes = get_image_bytes_from_file(test_image_path)\n", + " print(f\"Loaded test image: {test_image_files[0]}\")\n", + " else:\n", + " print(f\"Test image file not found: {test_image_path}\")\n", + " test_image_bytes = None\n", + " else:\n", + " print(\"Empty image_files array in test sample\")\n", + " test_image_bytes = None\n", + " except (json.JSONDecodeError, IndexError) as e:\n", + " print(f\"Failed to parse test image_files: {e}\")\n", + " test_image_bytes = None\n", + " else:\n", + " print(\"No image_files found in test sample\")\n", + " test_image_bytes = None\n", + "else:\n", + " print(\"Test split is empty or could not be loaded\")\n", + " test_image_bytes = None\n", + "\n", + "if test_image_bytes is not None:\n", + " print(f\"\\nTesting similarity search with test sample {test_sample_index}...\")\n", + "\n", + " # Generate embedding for test image\n", + " test_embedding = bedrock_client.generate_embedding(\n", + " image_source=test_image_bytes,\n", + " model_id=EMBEDDING_MODEL_ID,\n", + " dimensions=EMBEDDING_DIMENSIONS\n", + " )\n", + "else:\n", + " print(\"No test image available for similarity search verification.\")\n", + " test_embedding = None\n", + "\n", + "if test_embedding is not None:\n", + " # Query S3 Vectors for similar examples\n", + " response = s3vectors_client.query_vectors(\n", + " vectorBucketName=S3_VECTORS_BUCKET,\n", + " indexName=S3_VECTORS_INDEX,\n", + " queryVector={\"float32\": test_embedding},\n", + " topK=5,\n", + " returnDistance=True,\n", + " returnMetadata=True\n", + " )\n", + "\n", + " print(f\"\\nFound {len(response['vectors'])} similar examples:\")\n", + " for i, vector in enumerate(response['vectors']):\n", + " distance = vector.get('distance', 'N/A')\n", + " key = vector.get('key', 'N/A')\n", + " metadata = vector.get('metadata', {})\n", + " class_label = metadata.get('classLabel', 'N/A')\n", + " class_prompt = metadata.get('classPrompt', 'N/A')\n", + " attributes_prompt = metadata.get('attributesPrompt', 'N/A')\n", + " image_path = metadata.get('imagePath', 'N/A')\n", + " \n", + " print(f\" {i+1}. 
Key: {key}\")\n", + " print(f\" Distance: {distance:.4f}\")\n", + " print(f\" Class Label: {class_label}\")\n", + " print(f\" Class Prompt: {class_prompt}\")\n", + " print(f\" Attributes Prompt: {attributes_prompt[:100]}...\") # Truncate for readability\n", + " print(f\" Image Path: {image_path}\")\n", + " print()\n", + "else:\n", + " print(\"Skipping similarity search - no test embedding available.\")\n", + "\n", + "# Display source image and found similar images\n", + "if test_image_bytes is not None and 'response' in locals() and response.get('vectors'):\n", + " import matplotlib.pyplot as plt\n", + " from PIL import Image as PILImage\n", + " import io\n", + " \n", + " # Calculate number of images to display (source + top similar images)\n", + " num_similar = min(3, len(response['vectors'])) # Show top 3 similar images\n", + " total_images = 1 + num_similar # Source + similar images\n", + " \n", + " # Create subplot layout\n", + " fig, axes = plt.subplots(1, total_images, figsize=(5 * total_images, 6))\n", + " if total_images == 1:\n", + " axes = [axes] # Make it iterable for single image\n", + " \n", + " # Display source image\n", + " source_img = PILImage.open(io.BytesIO(test_image_bytes))\n", + " axes[0].imshow(source_img)\n", + " axes[0].set_title(f'Source Image (Test Sample {test_sample_index})', fontsize=12, fontweight='bold')\n", + " axes[0].axis('off')\n", + " \n", + " # Display similar images\n", + " for i, vector in enumerate(response['vectors'][:num_similar]):\n", + " try:\n", + " # Get image path from metadata\n", + " metadata = vector.get('metadata', {})\n", + " image_s3_path = metadata.get('imagePath', '')\n", + " distance = vector.get('distance', 0)\n", + " \n", + " if image_s3_path:\n", + " # Extract S3 key from the full S3 URI\n", + " s3_key = image_s3_path.replace(f's3://{GENAIIDP_S3_WORKING_BUCKET}/', '')\n", + " \n", + " # Download image from S3\n", + " try:\n", + " response_obj = s3_client.get_object(Bucket=GENAIIDP_S3_WORKING_BUCKET, Key=s3_key)\n", + " image_data = response_obj['Body'].read()\n", + " similar_img = PILImage.open(io.BytesIO(image_data))\n", + " \n", + " # Display the image\n", + " axes[i + 1].imshow(similar_img)\n", + " axes[i + 1].set_title(f'Similar #{i+1}\\nDistance: {distance:.3f}', fontsize=10)\n", + " axes[i + 1].axis('off')\n", + " \n", + " except Exception as e:\n", + " # If can't load from S3, show placeholder\n", + " axes[i + 1].text(0.5, 0.5, f'Image not available\\n{str(e)[:50]}...', \n", + " ha='center', va='center', transform=axes[i + 1].transAxes)\n", + " axes[i + 1].set_title(f'Similar #{i+1}\\nDistance: {distance:.3f}', fontsize=10)\n", + " axes[i + 1].axis('off')\n", + " else:\n", + " # No image path available\n", + " axes[i + 1].text(0.5, 0.5, 'No image path', ha='center', va='center', \n", + " transform=axes[i + 1].transAxes)\n", + " axes[i + 1].set_title(f'Similar #{i+1}\\nDistance: {distance:.3f}', fontsize=10)\n", + " axes[i + 1].axis('off')\n", + " \n", + " except Exception as e:\n", + " print(f'Error displaying similar image {i+1}: {e}')\n", + " axes[i + 1].text(0.5, 0.5, f'Error: {str(e)[:30]}...', ha='center', va='center', \n", + " transform=axes[i + 1].transAxes)\n", + " axes[i + 1].set_title(f'Similar #{i+1}', fontsize=10)\n", + " axes[i + 1].axis('off')\n", + " \n", + " plt.tight_layout()\n", + " plt.show()\n", + " \n", + " print(f'\\nDisplayed source image and top {num_similar} similar images from the vector store.')\n", + " \n", + "else:\n", + " print('No images to display - either no test image was loaded or no 
similar images were found.')\n",
+    "    if test_image_bytes is None:\n",
+    "        print('Reason: No test image available')\n",
+    "    elif 'response' not in locals():\n",
+    "        print('Reason: No similarity search was performed')\n",
+    "    elif not response.get('vectors'):\n",
+    "        print('Reason: No similar images found in vector store')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 8. Summary and Next Steps"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"=== Few-shot Dataset Import Summary ===\")\n",
+    "print(f\"✅ Dataset: FCC Invoices (REALKIE)\")\n",
+    "print(f\"✅ Samples processed: {samples_to_process - len(failed_samples) if 'samples_to_process' in locals() and 'failed_samples' in locals() else 'N/A'}\")\n",
+    "print(f\"✅ S3 Vectors Bucket: {S3_VECTORS_BUCKET}\")\n",
+    "print(f\"✅ S3 Vectors Index: {S3_VECTORS_INDEX}\")\n",
+    "print(f\"✅ Images stored in: s3://{GENAIIDP_S3_WORKING_BUCKET}/fcc_invoices/\")\n",
+    "print(f\"✅ Embedding Model: {EMBEDDING_MODEL_ID}\")\n",
+    "print(f\"✅ Similarity search verified\")\n",
+    "\n",
+    "print(\"\\n=== Next Steps ===\")\n",
+    "print(\"1. ✅ Updated attributes mapping to match actual FCC invoices dataset structure\")\n",
+    "print(\"2. ✅ Added ground truth label parsing from CSV data\")\n",
+    "print(\"3. Configure your IDP extraction to use the dynamic few-shot Lambda ARN\")\n",
+    "print(\"4. Test document processing with few-shot examples!\")\n",
+    "print(\"5. Fine-tune the label parsing logic if needed based on your specific use case\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

From c8b3b2490e4213d4c58de4f424185504f4a7eb8a Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Fri, 12 Dec 2025 17:30:37 +0100
Subject: [PATCH 23/39] chore: use custom_prompt_lambda_arn parameter

---
 .../step3_extraction_with_dynamic_few_shot.ipynb | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb b/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb
index 5d5a0663..88879c64 100644
--- a/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb
+++ b/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb
@@ -14,13 +14,13 @@
     "- Handle errors and monitor performance\n",
     "\n",
     "**Prerequisites:**\n",
-    "- Completed Step 2 (Classification)\n",
+    "- Completed Step 2 (Classification) (`notebooks/fewshot_dataset_import.ipynb`)\n",
     "- AWS Lambda permissions to create/invoke functions\n",
     "- Dynamic few-shot Lambda function deployed\n",
-    "- S3 Vectors index populated with examples (`notebooks/misc/fewshot_dataset_import.ipynb`)\n",
+    "- S3 Vectors index populated with examples (`notebooks/fewshot_dataset_import.ipynb`)\n",
     "\n",
     "**Key Feature:**\n",
-    "The `dynamic_few_shot_lambda_arn` configuration field allows you to dynamically retrieve similar examples using S3 Vectors similarity search to improve extraction accuracy through few-shot prompting."
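+    "\n",
+    "A minimal sketch of wiring this up (the ARN below is a placeholder for illustration; section 3 of this notebook resolves the real one):\n",
+    "\n",
+    "```python\n",
+    "# Hypothetical ARN - replace with your deployed function's ARN\n",
+    "CONFIG['extraction']['custom_prompt_lambda_arn'] = (\n",
+    "    'arn:aws:lambda:us-east-1:123456789012:function:GENAIIDP-dynamic-few-shot'\n",
+    ")\n",
+    "```\n",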
+ "The `custom_prompt_lambda_arn` configuration field allows you to dynamically retrieve similar examples using S3 Vectors similarity search to improve extraction accuracy through few-shot prompting." ] }, { @@ -71,7 +71,7 @@ "outputs": [], "source": [ "# Load document from previous step\n", - "classification_data_dir = Path(\".data/step2_classification\")\n", + "classification_data_dir = Path(\"../../../notebooks/examples/.data/step2_classification\")\n", "\n", "# Load document object from JSON\n", "document_path = classification_data_dir / \"document.json\"\n", @@ -174,12 +174,12 @@ "source": [ "# Create configuration WITHOUT dynamic few-shot Lambda\n", "config_default = CONFIG.copy()\n", - "if 'dynamic_few_shot_lambda_arn' in config_default.get('extraction', {}):\n", - " del config_default['extraction']['dynamic_few_shot_lambda_arn']\n", + "if 'custom_prompt_lambda_arn' in config_default.get('extraction', {}):\n", + " del config_default['extraction']['custom_prompt_lambda_arn']\n", "\n", "print(\"=== DEFAULT EXTRACTION CONFIGURATION ===\")\n", "print(f\"Model: {config_default.get('extraction', {}).get('model')}\")\n", - "print(f\"Dynamic Few-Shot Lambda: {config_default.get('extraction', {}).get('dynamic_few_shot_lambda_arn', 'None')}\")\n", + "print(f\"Dynamic Few-Shot Lambda: {config_default.get('extraction', {}).get('custom_prompt_lambda_arn', 'None')}\")\n", "\n", "# Create extraction service with default config\n", "extraction_service_default = extraction.ExtractionService(config=config_default)\n", @@ -259,7 +259,7 @@ "if DYNAMIC_FEW_SHOT_LAMBDA_ARN:\n", " # Create configuration WITH dynamic few-shot Lambda\n", " config_few_shot = CONFIG.copy()\n", - " config_few_shot['extraction']['dynamic_few_shot_lambda_arn'] = DYNAMIC_FEW_SHOT_LAMBDA_ARN\n", + " config_few_shot['extraction']['custom_prompt_lambda_arn'] = DYNAMIC_FEW_SHOT_LAMBDA_ARN\n", " \n", " print(\"=== DYNAMIC FEW-SHOT EXTRACTION CONFIGURATION ===\")\n", " print(f\"Model: {config_few_shot.get('extraction', {}).get('model')}\")\n", From 41b2a579810c2354b72f9372d35bac678ddc6dcc Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 12 Dec 2025 17:30:59 +0100 Subject: [PATCH 24/39] chore: add classes configuration for step-by-step example --- .../notebooks/config/classes.yaml | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml diff --git a/plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml b/plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml new file mode 100644 index 00000000..e19746f9 --- /dev/null +++ b/plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml @@ -0,0 +1,119 @@ +classes: + - $schema: https://json-schema.org/draft/2020-12/schema + $defs: + LineItem: + type: object + properties: + LineItemEndDate: + default: 'null' + x-aws-idp-confidence-threshold: '0.8' + examples: + - 11/06/2012 + data_type: string + format: date + description: End date for each line item (typically in MM/DD/YY format) + type: string + x-aws-idp-evaluation-method: LEVENSHTEIN + x-aws-idp-evaluation-threshold: '0.7' + LineItemDescription: + data_type: string + description: Description of the line item + type: string + x-aws-idp-evaluation-method: LEVENSHTEIN + x-aws-idp-evaluation-threshold: '0.7' + LineItemStartDate: + default: 'null' + x-aws-idp-confidence-threshold: '0.8' + examples: + - 11/06/2012 + data_type: string + format: date + description: Start date for each line item (typically in MM/DD/YY format) + type: string + 
x-aws-idp-evaluation-method: LEVENSHTEIN + x-aws-idp-evaluation-threshold: '0.7' + LineItemDays: + maxItems: '7' + x-aws-idp-confidence-threshold: '0.8' + uniqueItems: true + description: List of days of the week for the line item + type: array + items: + type: string + data_type: string + enum: + - M + - T + - W + - Th + - F + - S + - Su + x-aws-idp-evaluation-method: EXACT + x-aws-idp-evaluation-threshold: '0.7' + LineItemRate: + data_type: string + description: Rate of the line item + x-aws-idp-confidence-threshold: '0.8' + type: number + x-aws-idp-evaluation-method: NUMERIC_EXACT + description: Invoice document + type: object + x-aws-idp-document-type: Invoice + properties: + LineItems: + type: array + description: List of line items in the invoice + items: + $ref: '#/$defs/LineItem' + Agency: + x-aws-idp-confidence-threshold: '0.8' + data_type: string + description: The advertising agency or station. May be labelled Agency, or Station. + x-aws-idp-evaluation-weight: '2' + type: string + x-aws-idp-evaluation-method: LEVENSHTEIN + x-aws-idp-evaluation-threshold: '0.7' + Advertiser: + x-aws-idp-confidence-threshold: '0.8' + data_type: string + description: The political advertiser or campaign purchasing the broadcast time + x-aws-idp-evaluation-weight: '2' + type: string + x-aws-idp-evaluation-method: FUZZY + x-aws-idp-evaluation-threshold: '0.8' + GrossTotal: + data_type: string + description: >- + The total gross amount for all line items before any discounts or + adjustments + x-aws-idp-evaluation-weight: '2' + x-aws-idp-confidence-threshold: '0.8' + type: number + x-aws-idp-evaluation-method: NUMERIC_EXACT + PaymentTerms: + examples: + - Net 30 + data_type: string + description: Payment terms + x-aws-idp-evaluation-weight: '0.2' + type: string + x-aws-idp-evaluation-method: FUZZY + x-aws-idp-evaluation-threshold: '0.7' + AgencyCommission: + data_type: string + description: Agency commission + x-aws-idp-evaluation-weight: '0.2' + x-aws-idp-confidence-threshold: '0.8' + type: number + x-aws-idp-evaluation-method: NUMERIC_EXACT + NetAmountDue: + data_type: string + description: >- + The final net amount due after any discounts or adjustments have been + applied (stored as string with commas) + x-aws-idp-evaluation-weight: '2' + x-aws-idp-confidence-threshold: '0.8' + type: number + x-aws-idp-evaluation-method: NUMERIC_EXACT + $id: Invoice From 0c2b1055af2717130f1ce782ff7dd425bae3d5a6 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 12 Dec 2025 17:40:48 +0100 Subject: [PATCH 25/39] chore: remove step-by-step extraction notebook --- .../notebooks/config/classes.yaml | 119 ----- .../config/extraction_with_few_shot.yaml | 101 ----- ...ep3_extraction_with_dynamic_few_shot.ipynb | 420 ------------------ 3 files changed, 640 deletions(-) delete mode 100644 plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml delete mode 100644 plugins/dynamic-few-shot-lambda/notebooks/config/extraction_with_few_shot.yaml delete mode 100644 plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb diff --git a/plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml b/plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml deleted file mode 100644 index e19746f9..00000000 --- a/plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml +++ /dev/null @@ -1,119 +0,0 @@ -classes: - - $schema: https://json-schema.org/draft/2020-12/schema - $defs: - LineItem: - type: object - properties: - LineItemEndDate: - default: 'null' - x-aws-idp-confidence-threshold: 
'0.8' - examples: - - 11/06/2012 - data_type: string - format: date - description: End date for each line item (typically in MM/DD/YY format) - type: string - x-aws-idp-evaluation-method: LEVENSHTEIN - x-aws-idp-evaluation-threshold: '0.7' - LineItemDescription: - data_type: string - description: Description of the line item - type: string - x-aws-idp-evaluation-method: LEVENSHTEIN - x-aws-idp-evaluation-threshold: '0.7' - LineItemStartDate: - default: 'null' - x-aws-idp-confidence-threshold: '0.8' - examples: - - 11/06/2012 - data_type: string - format: date - description: Start date for each line item (typically in MM/DD/YY format) - type: string - x-aws-idp-evaluation-method: LEVENSHTEIN - x-aws-idp-evaluation-threshold: '0.7' - LineItemDays: - maxItems: '7' - x-aws-idp-confidence-threshold: '0.8' - uniqueItems: true - description: List of days of the week for the line item - type: array - items: - type: string - data_type: string - enum: - - M - - T - - W - - Th - - F - - S - - Su - x-aws-idp-evaluation-method: EXACT - x-aws-idp-evaluation-threshold: '0.7' - LineItemRate: - data_type: string - description: Rate of the line item - x-aws-idp-confidence-threshold: '0.8' - type: number - x-aws-idp-evaluation-method: NUMERIC_EXACT - description: Invoice document - type: object - x-aws-idp-document-type: Invoice - properties: - LineItems: - type: array - description: List of line items in the invoice - items: - $ref: '#/$defs/LineItem' - Agency: - x-aws-idp-confidence-threshold: '0.8' - data_type: string - description: The advertising agency or station. May be labelled Agency, or Station. - x-aws-idp-evaluation-weight: '2' - type: string - x-aws-idp-evaluation-method: LEVENSHTEIN - x-aws-idp-evaluation-threshold: '0.7' - Advertiser: - x-aws-idp-confidence-threshold: '0.8' - data_type: string - description: The political advertiser or campaign purchasing the broadcast time - x-aws-idp-evaluation-weight: '2' - type: string - x-aws-idp-evaluation-method: FUZZY - x-aws-idp-evaluation-threshold: '0.8' - GrossTotal: - data_type: string - description: >- - The total gross amount for all line items before any discounts or - adjustments - x-aws-idp-evaluation-weight: '2' - x-aws-idp-confidence-threshold: '0.8' - type: number - x-aws-idp-evaluation-method: NUMERIC_EXACT - PaymentTerms: - examples: - - Net 30 - data_type: string - description: Payment terms - x-aws-idp-evaluation-weight: '0.2' - type: string - x-aws-idp-evaluation-method: FUZZY - x-aws-idp-evaluation-threshold: '0.7' - AgencyCommission: - data_type: string - description: Agency commission - x-aws-idp-evaluation-weight: '0.2' - x-aws-idp-confidence-threshold: '0.8' - type: number - x-aws-idp-evaluation-method: NUMERIC_EXACT - NetAmountDue: - data_type: string - description: >- - The final net amount due after any discounts or adjustments have been - applied (stored as string with commas) - x-aws-idp-evaluation-weight: '2' - x-aws-idp-confidence-threshold: '0.8' - type: number - x-aws-idp-evaluation-method: NUMERIC_EXACT - $id: Invoice diff --git a/plugins/dynamic-few-shot-lambda/notebooks/config/extraction_with_few_shot.yaml b/plugins/dynamic-few-shot-lambda/notebooks/config/extraction_with_few_shot.yaml deleted file mode 100644 index addd9a01..00000000 --- a/plugins/dynamic-few-shot-lambda/notebooks/config/extraction_with_few_shot.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# Extraction Service Configuration -extraction: - top_p: '0.1' - max_tokens: '4096' - top_k: '5' - temperature: '0.0' - model: us.amazon.nova-pro-v1:0 - system_prompt: >- 
- You are a document assistant. Respond only with JSON. Never make up data, only provide data found in the document being provided. - task_prompt: >- - - - You are an expert in document analysis and information extraction. - You can understand and extract key information from documents classified as type - - {DOCUMENT_CLASS}. - - - - - - - Your task is to take the unstructured text provided and convert it into a well-organized table format using JSON. Identify the main entities, attributes, or categories mentioned in the attributes list below and use them as keys in the JSON object. - Then, extract the relevant information from the text and populate the corresponding values in the JSON object. - - - - - - - Guidelines: - 1. Ensure that the data is accurately represented and properly formatted within - the JSON structure - 2. Include double quotes around all keys and values - 3. Do not make up data - only extract information explicitly found in the - document - 4. Do not use /n for new lines, use a space instead - 5. If a field is not found or if unsure, return null - 6. All dates should be in MM/DD/YYYY format - 7. Do not perform calculations or summations unless totals are explicitly given - 8. If an alias is not found in the document, return null - 9. Guidelines for checkboxes: - 9.A. CAREFULLY examine each checkbox, radio button, and selection field: - - Look for marks like โœ“, โœ—, x, filled circles (โ—), darkened areas, or handwritten checks indicating selection - - For checkboxes and multi-select fields, ONLY INCLUDE options that show clear visual evidence of selection - - DO NOT list options that have no visible selection mark - 9.B. For ambiguous or overlapping tick marks: - - If a mark overlaps between two or more checkboxes, determine which option contains the majority of the mark - - Consider a checkbox selected if the mark is primarily inside the check box or over the option text - - When a mark touches multiple options, analyze which option was most likely intended based on position and density. For handwritten checks, the mark typically flows from the selected checkbox outward. - - Carefully analyze visual cues and contextual hints. Think from a human perspective, anticipate natural tendencies, and apply thoughtful reasoning to make the best possible judgment. - 10. Think step by step first and then answer. - - - - If the attributes section below contains a list of attribute names and - descriptions, then output only those attributes, using the provided - descriptions as guidance for finding the correct values. - - - - {ATTRIBUTE_NAMES_AND_DESCRIPTIONS} - - - - - - {FEW_SHOT_EXAMPLES} - - - - <> - - - - - {DOCUMENT_TEXT} - - - - - - - {DOCUMENT_IMAGE} - - - - - - - Extract key information from the document and return a JSON object with the following key steps: - 1. Carefully analyze the document text to identify the requested attributes - 2. Extract only information explicitly found in the document - never make up data - 3. Format all dates as MM/DD/YYYY and replace newlines with spaces - 4. For checkboxes, only include options with clear visual selection marks - 5. Use null for any fields not found in the document - 6. Ensure the output is properly formatted JSON with quoted keys and values - 7. 
Think step by step before finalizing your answer - - - diff --git a/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb b/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb deleted file mode 100644 index 88879c64..00000000 --- a/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb +++ /dev/null @@ -1,420 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Step 3: Dynamic Few-Shot Information Extraction\n", - "\n", - "This notebook demonstrates the **dynamic few-shot prompting feature** for Pattern 2. It shows how to:\n", - "\n", - "- Configure dynamic few-shot Lambda functions extraction\n", - "- Compare default vs examples-enhanced extraction results\n", - "- Inspect Lambda payloads and responses\n", - "- Handle errors and monitor performance\n", - "\n", - "**Prerequisites:**\n", - "- Completed Step 2 (Classification) (`notebooks/fewshot_dataset_import.ipynb`)\n", - "- AWS Lambda permissions to create/invoke functions\n", - "- Dynamic few-shot Lambda function deployed\n", - "- S3 Vectors index populated with examples (`notebooks/fewshot_dataset_import.ipynb`)\n", - "\n", - "**Key Feature:**\n", - "The `custom_prompt_lambda_arn` configuration field allows you to dynamically retrieve similar examples using S3 Vectors similarity search to improve extraction accuracy through few-shot prompting." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Setup and Import Libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import json\n", - "import time\n", - "import logging\n", - "import boto3\n", - "from pathlib import Path\n", - "import yaml\n", - "\n", - "# Import IDP libraries\n", - "from idp_common.models import Document, Status\n", - "from idp_common.s3 import get_json_content\n", - "from idp_common import extraction\n", - "\n", - "# Configure logging to see Lambda invocation details\n", - "logging.basicConfig(level=logging.INFO)\n", - "logging.getLogger('idp_common.extraction').setLevel(logging.INFO)\n", - "logging.getLogger('idp_common.bedrock.client').setLevel(logging.INFO)\n", - "\n", - "print(\"Libraries imported successfully\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. 
Load Previous Step Data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Load document from previous step\n",
-    "classification_data_dir = Path(\"../../../notebooks/examples/.data/step2_classification\")\n",
-    "\n",
-    "# Load document object from JSON\n",
-    "document_path = classification_data_dir / \"document.json\"\n",
-    "with open(document_path, 'r') as f:\n",
-    "    document = Document.from_json(f.read())\n",
-    "\n",
-    "# Load configuration directly from config files\n",
-    "config_dir = Path(\"config\")\n",
-    "CONFIG = {}\n",
-    "\n",
-    "# Load each configuration file\n",
-    "config_files = [\n",
-    "    \"extraction_with_few_shot.yaml\",\n",
-    "    \"classes.yaml\"\n",
-    "]\n",
-    "\n",
-    "for config_file in config_files:\n",
-    "    config_path = config_dir / config_file\n",
-    "    if config_path.exists():\n",
-    "        with open(config_path, 'r') as f:\n",
-    "            file_config = yaml.safe_load(f)\n",
-    "        CONFIG.update(file_config)\n",
-    "        print(f\"Loaded {config_file}\")\n",
-    "    else:\n",
-    "        print(f\"Warning: {config_file} not found\")\n",
-    "\n",
-    "# Load environment info\n",
-    "env_path = classification_data_dir / \"environment.json\"\n",
-    "with open(env_path, 'r') as f:\n",
-    "    env_info = json.load(f)\n",
-    "\n",
-    "# Set environment variables\n",
-    "os.environ['AWS_REGION'] = env_info['region']\n",
-    "os.environ['METRIC_NAMESPACE'] = 'IDP-Dynamic-Few-Shot'\n",
-    "\n",
-    "print(f\"Loaded document: {document.id}\")\n",
-    "print(f\"Document status: {document.status.value}\")\n",
-    "print(f\"Number of sections: {len(document.sections) if document.sections else 0}\")\n",
-    "print(f\"Loaded configuration sections: {list(CONFIG.keys())}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 3. Configure Dynamic Few-Shot Lambda ARN"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 🔧 CONFIGURATION: Set your dynamic few-shot Lambda ARN here\n",
-    "# Replace with your actual Lambda function ARN for live testing\n",
-    "\n",
-    "# Check if dynamic few-shot Lambda function exists\n",
-    "lambda_client = boto3.client('lambda')\n",
-    "DYNAMIC_FEW_SHOT_LAMBDA_ARN = None\n",
-    "\n",
-    "try:\n",
-    "    response = lambda_client.get_function(FunctionName='GENAIIDP-dynamic-few-shot')\n",
-    "    DYNAMIC_FEW_SHOT_LAMBDA_ARN = response['Configuration']['FunctionArn']\n",
-    "    print(f\"✅ Found dynamic few-shot Lambda function: {DYNAMIC_FEW_SHOT_LAMBDA_ARN}\")\n",
-    "except lambda_client.exceptions.ResourceNotFoundException:\n",
-    "    print(\"⚠️ Dynamic Few-Shot Lambda function not found: GENAIIDP-dynamic-few-shot\")\n",
-    "    print(\"💡 Deploy using: cd notebooks/examples/dynamic-few-shot-lambda && sam deploy --guided\")\n",
-    "except Exception as e:\n",
-    "    print(f\"Error checking Lambda function: {e}\")\n",
-    "\n",
-    "if not DYNAMIC_FEW_SHOT_LAMBDA_ARN:\n",
-    "    print(\"⚠️ No dynamic few-shot Lambda ARN configured\")\n",
-    "    print(\"💡 This demo will show standard extraction without few-shot examples\")\n",
-    "    print(\"🔧 To test with examples, deploy the dynamic few-shot Lambda first\")\n",
-    "else:\n",
-    "    print(f\"✅ Dynamic few-shot Lambda ARN configured: {DYNAMIC_FEW_SHOT_LAMBDA_ARN}\")\n",
-    "    print(\"🚀 This demo will use few-shot examples from S3 Vectors\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 4. 
Extraction Comparison: Default vs Dynamic Few-Shot"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 4.1 Default Extraction (Without Dynamic Few-Shot)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create configuration WITHOUT dynamic few-shot Lambda\n",
-    "config_default = CONFIG.copy()\n",
-    "if 'custom_prompt_lambda_arn' in config_default.get('extraction', {}):\n",
-    "    del config_default['extraction']['custom_prompt_lambda_arn']\n",
-    "\n",
-    "print(\"=== DEFAULT EXTRACTION CONFIGURATION ===\")\n",
-    "print(f\"Model: {config_default.get('extraction', {}).get('model')}\")\n",
-    "print(f\"Dynamic Few-Shot Lambda: {config_default.get('extraction', {}).get('custom_prompt_lambda_arn', 'None')}\")\n",
-    "\n",
-    "# Create extraction service with default config\n",
-    "extraction_service_default = extraction.ExtractionService(config=config_default)\n",
-    "print(\"\\n✅ Default extraction service initialized\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Run default extraction on first section\n",
-    "if document.sections:\n",
-    "    first_section = document.sections[0]\n",
-    "    print(f\"🔄 Processing section {first_section.section_id} with DEFAULT prompts\")\n",
-    "    print(f\"Classification: {first_section.classification}\")\n",
-    "    print(f\"Pages: {first_section.page_ids}\")\n",
-    "    \n",
-    "    # Save original document state\n",
-    "    document_default = Document.from_json(document.to_json())\n",
-    "    \n",
-    "    # Process with default extraction\n",
-    "    start_time = time.time()\n",
-    "    document_default = extraction_service_default.process_document_section(\n",
-    "        document=document_default,\n",
-    "        section_id=first_section.section_id\n",
-    "    )\n",
-    "    default_extraction_time = time.time() - start_time\n",
-    "    \n",
-    "    print(f\"✅ Default extraction completed in {default_extraction_time:.2f} seconds\")\n",
-    "\n",
-    "    # Store results for comparison\n",
-    "    default_section_result = None\n",
-    "    for section in document_default.sections:\n",
-    "        if section.section_id == first_section.section_id:\n",
-    "            default_section_result = section\n",
-    "            break\n",
-    "    \n",
-    "else:\n",
-    "    print(\"⚠️ No sections found in document\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Show section extraction result\n",
-    "if default_section_result:\n",
-    "    print(f\"\\nSection {default_section_result.section_id} extraction result:\")\n",
-    "    extraction_result_uri = default_section_result.extraction_result_uri\n",
-    "\n",
-    "    if extraction_result_uri:\n",
-    "        result = get_json_content(extraction_result_uri)\n",
-    "        result_json = json.dumps(result[\"inference_result\"], indent=2)\n",
-    "        print(result_json)\n",
-    "\n",
-    "else:\n",
-    "    print(\"⚠️ No sections found in document\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 4.2 Dynamic Few-Shot Extraction using Lambda"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if DYNAMIC_FEW_SHOT_LAMBDA_ARN:\n",
-    "    # Create configuration WITH dynamic few-shot Lambda\n",
-    "    config_few_shot = CONFIG.copy()\n",
-    "    config_few_shot['extraction']['custom_prompt_lambda_arn'] = DYNAMIC_FEW_SHOT_LAMBDA_ARN\n",
-    "    \n",
-    "    print(\"=== DYNAMIC FEW-SHOT EXTRACTION CONFIGURATION ===\")\n",
-    "    print(f\"Model: {config_few_shot.get('extraction', 
{}).get('model')}\")\n",
-    "    print(f\"Dynamic Few-Shot Lambda: {DYNAMIC_FEW_SHOT_LAMBDA_ARN}\")\n",
-    "    print(f\"Lambda Function Name: {DYNAMIC_FEW_SHOT_LAMBDA_ARN.split(':')[-1]}\")\n",
-    "    \n",
-    "    # Create extraction service with dynamic few-shot config\n",
-    "    extraction_service_few_shot = extraction.ExtractionService(config=config_few_shot)\n",
-    "    \n",
-    "    print(\"\\n✅ Dynamic few-shot extraction service initialized\")\n",
-    "    \n",
-    "else:\n",
-    "    print(\"⚠️ No dynamic few-shot Lambda ARN configured - skipping demonstration\")\n",
-    "    config_few_shot = None\n",
-    "    extraction_service_few_shot = None"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Run dynamic few-shot extraction on first section\n",
-    "if DYNAMIC_FEW_SHOT_LAMBDA_ARN and document.sections:\n",
-    "    first_section = document.sections[0]\n",
-    "    print(f\"🔄 Processing section {first_section.section_id} with DYNAMIC FEW-SHOT\")\n",
-    "    print(f\"Classification: {first_section.classification}\")\n",
-    "    print(f\"Pages: {first_section.page_ids}\")\n",
-    "    \n",
-    "    # Create fresh document copy for examples processing\n",
-    "    document_few_shot = Document.from_json(document.to_json())\n",
-    "    \n",
-    "    # Process with dynamic few-shot extraction\n",
-    "    start_time = time.time()\n",
-    "    \n",
-    "    try:\n",
-    "        document_few_shot = extraction_service_few_shot.process_document_section(\n",
-    "            document=document_few_shot,\n",
-    "            section_id=first_section.section_id\n",
-    "        )\n",
-    "        few_shot_extraction_time = time.time() - start_time\n",
-    "        \n",
-    "        print(f\"✅ Dynamic few-shot extraction completed in {few_shot_extraction_time:.2f} seconds\")\n",
-    "        \n",
-    "        # Store results for comparison\n",
-    "        few_shot_section_result = None\n",
-    "        for section in document_few_shot.sections:\n",
-    "            if section.section_id == first_section.section_id:\n",
-    "                few_shot_section_result = section\n",
-    "                break\n",
-    "        \n",
-    "        # Performance comparison\n",
-    "        overhead = few_shot_extraction_time - default_extraction_time\n",
-    "        print(f\"\\n📊 Performance Comparison:\")\n",
-    "        print(f\"   Default: {default_extraction_time:.2f}s\")\n",
-    "        print(f\"   Dynamic Few-Shot: {few_shot_extraction_time:.2f}s\")\n",
-    "        print(f\"   Dynamic Few-Shot Overhead: {overhead:.2f}s ({overhead/default_extraction_time*100:.1f}% increase)\")\n",
-    "        \n",
-    "    except Exception as e:\n",
-    "        print(f\"❌ Dynamic few-shot extraction failed: {e}\")\n",
-    "        print(\"\\n🔍 This demonstrates the fail-fast error handling behavior\")\n",
-    "        few_shot_section_result = None\n",
-    "        few_shot_extraction_time = None\n",
-    "    \n",
-    "else:\n",
-    "    print(\"⚠️ Skipping dynamic few-shot extraction (no Lambda configured or no sections)\")\n",
-    "    document_few_shot = None\n",
-    "    few_shot_section_result = None\n",
-    "    few_shot_extraction_time = None"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Show section extraction result\n",
-    "if few_shot_section_result:\n",
-    "    print(f\"\\nSection {few_shot_section_result.section_id} extraction result:\")\n",
-    "    extraction_result_uri = few_shot_section_result.extraction_result_uri\n",
-    "\n",
-    "    if extraction_result_uri:\n",
-    "        result = get_json_content(extraction_result_uri)\n",
-    "        result_json = json.dumps(result[\"inference_result\"], indent=2)\n",
-    "        print(result_json)\n",
-    "\n",
-    "else:\n",
-    "    print(\"⚠️ No sections found in document\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 5. Results and Summary"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"=== DEMO COMPLETE: SUMMARY ===\")\n",
-    "\n",
-    "sections_processed = 1 if document.sections else 0\n",
-    "dynamic_few_shot_used = DYNAMIC_FEW_SHOT_LAMBDA_ARN is not None\n",
-    "\n",
-    "print(f\"\\n✅ DEMO RESULTS:\")\n",
-    "print(f\"   📄 Document processed: {document.id}\")\n",
-    "print(f\"   📊 Sections processed: {sections_processed}\")\n",
-    "print(f\"   🔧 Dynamic Few-Shot used: {'Yes' if dynamic_few_shot_used else 'No'}\")\n",
-    "\n",
-    "if dynamic_few_shot_used and 'few_shot_extraction_time' in locals() and examples_extraction_time:\n",
-    "    print(f\"   ⏱️ Performance overhead: {few_shot_extraction_time - default_extraction_time:.2f}s\")\n",
-    "    print(f\"   📈 Accuracy improvement: Enhanced with few-shot examples\")\n",
-    "\n",
-    "print(f\"\\n🚀 TO IMPLEMENT DYNAMIC FEW-SHOT IN PRODUCTION:\")\n",
-    "print(f\"   1. 📝 Deploy dynamic few-shot Lambda stack\")\n",
-    "print(f\"   2. 📊 Populate S3 Vectors index with example documents\")\n",
-    "print(f\"   3. ⚙️ Add 'dynamic_few_shot_lambda_arn' to extraction config\")\n",
-    "print(f\"   4. 🧪 Test with your actual documents and use cases\")\n",
-    "print(f\"   5. 📊 Monitor CloudWatch logs for performance and accuracy\")\n",
-    "\n",
-    "print(f\"\\n📚 RESOURCES:\")\n",
-    "print(f\"   📖 Documentation: notebooks/examples/dynamic-few-shot-lambda/README.md\")\n",
-    "print(f\"   🔧 Lambda Function: notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py\")\n",
-    "print(f\"   ☁️ Deploy: cd notebooks/examples/dynamic-few-shot-lambda && sam deploy --guided\")\n",
-    "print(f\"   📊 Import Dataset: notebooks/misc/fewshot_dataset_import.ipynb\")\n",
-    "\n",
-    "print(f\"\\n📌 CONTINUE TO: step4_assessment.ipynb\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.11"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}

From 2d630ad4aa0b6bcb8de109795cbbdfbc8e508fde Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Tue, 16 Dec 2025 15:58:45 +0100
Subject: [PATCH 26/39] chore: fix step 3 extraction instructions

---
 plugins/dynamic-few-shot-lambda/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/dynamic-few-shot-lambda/README.md b/plugins/dynamic-few-shot-lambda/README.md
index d38e5384..ec58b3f6 100644
--- a/plugins/dynamic-few-shot-lambda/README.md
+++ b/plugins/dynamic-few-shot-lambda/README.md
@@ -115,7 +115,7 @@ extraction:
 ### Step 5: Run the Demo Notebook
 
 0. Run `notebooks/examples` steps 0, 1, 2
-1. Open `plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_custom_lambda.ipynb`
+1. Open `notebooks/examples/step3_extraction_with_custom_lambda.ipynb`. In section 3, set `DEMO_LAMBDA_ARN` to `arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot`
 2. 
Run all cells to see the comparison
 
 ## Lambda Interface
 

From daf70290d97e5a039b881628a560f8ab373ca9aa Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Tue, 16 Dec 2025 17:30:15 +0100
Subject: [PATCH 27/39] chore: cfn_nag allow * resource on its permissions policy

---
 plugins/dynamic-few-shot-lambda/template.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml
index 71f47f81..b0dc8f6f 100644
--- a/plugins/dynamic-few-shot-lambda/template.yml
+++ b/plugins/dynamic-few-shot-lambda/template.yml
@@ -78,6 +78,8 @@ Resources:
             reason: "Demo function - does not require reserved concurrency as it scales based on demand"
           - id: W58
             reason: "Demo function - DLQ not required"
+          - id: W11
+            reason: "Demo function - allow * resource on its permissions policy"
       # checkov:skip=CKV_AWS_116: "DLQ not required for AppSync resolver function as GraphQL handles retries"
      # checkov:skip=CKV_AWS_117: "Function does not require VPC access as it only interacts with AWS services via APIs"
      # checkov:skip=CKV_AWS_115: "Function does not require reserved concurrency as it scales based on demand"

From f1ec3b9e355ea9bac2ef27dc25949447d6743594 Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Tue, 16 Dec 2025 17:33:24 +0100
Subject: [PATCH 28/39] chore: validation for LogLevel

---
 plugins/dynamic-few-shot-lambda/template.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml
index b0dc8f6f..50959bb6 100644
--- a/plugins/dynamic-few-shot-lambda/template.yml
+++ b/plugins/dynamic-few-shot-lambda/template.yml
@@ -46,9 +46,16 @@ Parameters:
     Type: String
     Default: "GENAIIDP-dynamic-few-shot"
 
+  # Logging configuration
   LogLevel:
     Type: String
     Default: INFO
+    AllowedValues:
+      - DEBUG
+      - INFO
+      - WARN
+      - ERROR
+    Description: Default logging level

From b88ace71aca52164aabe6ebdafd09e784835c184 Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Tue, 16 Dec 2025 17:34:49 +0100
Subject: [PATCH 29/39] chore: make LogRetentionDays a parameter

---
 plugins/dynamic-few-shot-lambda/template.yml | 27 +++++++++++++++++++++++++
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml
index 50959bb6..895f5d76 100644
--- a/plugins/dynamic-few-shot-lambda/template.yml
+++ b/plugins/dynamic-few-shot-lambda/template.yml
@@ -57,6 +57,31 @@ Parameters:
     - ERROR
     Description: Default logging level
 
+  LogRetentionDays:
+    Type: Number
+    Default: 30
+    Description: Number of days to retain CloudWatch logs
+    AllowedValues:
+      [
+        1,
+        3,
+        5,
+        7,
+        14,
+        30,
+        60,
+        90,
+        120,
+        150,
+        180,
+        365,
+        400,
+        545,
+        731,
+        1827,
+        3653,
+      ]
+
   GenAIIDPS3OutputBucketName:
     Type: String
     Description: "GenAIIDP S3OutputBucketName"
 
@@ -157,7 +182,7 @@ Resources:
     Properties:
       LogGroupName: !Sub "/aws/lambda/${LambdaFunctionName}"
-      RetentionInDays: 7 # Short retention for demo purposes
+      RetentionInDays: !Ref LogRetentionDays

From d278154db48b97273d604e9f400c4358a8d952b3 Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Tue, 16 Dec 2025 17:36:52 +0100
Subject: [PATCH 30/39] chore: use KMS key for log group

---
 plugins/dynamic-few-shot-lambda/template.yml | 7 +------
 1 file 
changed, 1 insertion(+), 6 deletions(-) diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index 895f5d76..77bc8b6c 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -174,15 +174,10 @@ Resources: DynamicFewShotLogGroup: Type: AWS::Logs::LogGroup - Metadata: - cfn_nag: - rules_to_suppress: - - id: W84 - reason: "Demo function - KMS CMK not required, but can be added by customer for production use cases" - # checkov:skip=CKV_AWS_158: "Demo function - KMS CMK not required, but can be added by customer for production use cases" Properties: LogGroupName: !Sub "/aws/lambda/${LambdaFunctionName}" RetentionInDays: !Ref LogRetentionDays + KmsKeyId: !GetAtt GenAIIDPCustomerManagedEncryptionKeyArn DynamicFewShotVectorBucket: Type: AWS::S3Vectors::VectorBucket From 0115242f7389dff9322d477acaaaa6ec4ed0519d Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Wed, 17 Dec 2025 11:58:13 +0100 Subject: [PATCH 31/39] chore: make bucket creation optional, add KMS key, add dataset bucket --- plugins/dynamic-few-shot-lambda/template.yml | 224 +++++++++++++++---- 1 file changed, 180 insertions(+), 44 deletions(-) diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index 77bc8b6c..ca57fa63 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -18,19 +18,27 @@ Parameters: VectorBucketName: Type: String - Default: "genaiidp-dynamic-few-shot" + Default: "" + Description: >- + (Optional) Existing S3 vectors bucket used. Provide the name of an existing S3 vectors + bucket here or leave blank to automatically create a new S3 vectors bucket. VectorIndexName: Type: String - Default: "documents" - - VectorDimensions: - Type: Number - Default: 3072 + Default: "" + Description: >- + (Optional) Existing S3 vectors index used. Provide the name of an existing S3 vectors + index here or leave blank to automatically create a new S3 vectors index. ModelId: Type: String Default: "amazon.nova-2-multimodal-embeddings-v1:0" + Description: Vector embedding model to use to create meaningful vector representations of documents + + VectorDimensions: + Type: Number + Default: 3072 + Description: Vector embedding length to use, as defined by the embedding model in use TopK: Type: Number @@ -44,7 +52,14 @@ Parameters: LambdaFunctionName: Type: String - Default: "GENAIIDP-dynamic-few-shot" + Default: "IDP-dynamic-few-shot" + + DatasetBucketName: + Type: String + Default: "" + Description: >- + (Optional) Existing bucket used for dynamic few-shot datasets. Provide the name of + an existing bucket here or leave blank to automatically create a new bucket. 
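+
+  # A sketch of supplying existing resources at deploy time
+  # (the bucket and index names below are placeholders, not defaults):
+  #   sam deploy --parameter-overrides \
+  #     VectorBucketName=my-existing-vector-bucket \
+  #     VectorIndexName=my-existing-index \
+  #     DatasetBucketName=my-existing-dataset-bucket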
# Logging configuration LogLevel: @@ -82,20 +97,16 @@ Parameters: 3653, ] - GenAIIDPS3OutputBucketName: - Type: String - Description: "GenAIIDP S3OutputBucketName" - - GenAIIDPS3WorkingBucketName: - Type: String - Description: "GenAIIDP WorkingBucket Name" - - GenAIIDPCustomerManagedEncryptionKeyArn: + # GenAI IDP parameters + IDPS3LoggingBucketName: Type: String - Description: "GenAIIDP CustomerManagedEncryptionKey ARN" + Description: "IDP LoggingBucket Name" Conditions: HasPermissionsBoundary: !Not [!Equals [!Ref PermissionsBoundaryArn, ""]] + ShouldCreateVectorBucket: !Equals [ !Ref VectorBucketName, "" ] + ShouldCreateVectorIndex: !Equals [ !Ref VectorIndexName, "" ] + ShouldCreateDatasetBucket: !Equals [ !Ref DatasetBucketName, "" ] Resources: @@ -126,12 +137,22 @@ Resources: - arm64 Timeout: 300 MemorySize: 512 - Description: Demo Lambda function for GenAI IDP dynamic few-shot prompting + Description: Demo Lambda function for GenAI IDP dynamic few-shot prompting using S3 Vectors Environment: Variables: LOG_LEVEL: !Ref LogLevel - S3VECTOR_BUCKET: !Ref VectorBucketName - S3VECTOR_INDEX: !Ref VectorIndexName + S3VECTOR_BUCKET: !If + - ShouldCreateVectorBucket + # Error: Requested attribute VectorBucketName must be a readonly property in schema for AWS::S3Vectors::VectorBucket + # - !GetAtt DynamicFewShotVectorBucket.VectorBucketName + - !Select [1, !Split ["/", !Ref DynamicFewShotVectorBucket]] + - !Ref VectorBucketName + S3VECTOR_INDEX: !If + - ShouldCreateVectorIndex + # Error: Requested attribute IndexName must be a readonly property in schema for AWS::S3Vectors::Index + # - !GetAtt DocumentsIndex.IndexName + - !Select [3, !Split ["/", !Ref DocumentsIndex]] + - !Ref VectorIndexName S3VECTOR_DIMENSIONS: !Ref VectorDimensions MODEL_ID: !Ref ModelId TOP_K: !Ref TopK @@ -142,9 +163,10 @@ Resources: Policies: - AWSLambdaBasicExecutionRole - S3ReadPolicy: - BucketName: !Ref GenAIIDPS3OutputBucketName - - S3ReadPolicy: - BucketName: !Ref GenAIIDPS3WorkingBucketName + BucketName: !If + - ShouldCreateDatasetBucket + - !Ref DatasetBucket + - !Ref DatasetBucketName - Statement: - Effect: Allow Action: cloudwatch:PutMetricData @@ -161,7 +183,13 @@ Resources: - s3vectors:GetVectors - s3vectors:QueryVectors Resource: - - !Ref DynamicFewShotVectorIndex + - !If + - ShouldCreateVectorIndex + - !Ref DocumentsIndex + - !If + - ShouldCreateVectorBucket + - !Sub "${DynamicFewShotVectorBucket}/index/${DocumentsIndex}" + - !Sub "arn:${AWS::Partition}:s3vectors:${AWS::Region}:${AWS::AccountId}:bucket/${VectorBucketName}/index/${DocumentsIndex}" - Effect: Allow Action: - kms:Encrypt @@ -170,32 +198,27 @@ Resources: - kms:GenerateDataKey* - kms:DescribeKey Resource: - - !Ref GenAIIDPCustomerManagedEncryptionKeyArn + - !GetAtt CustomerManagedEncryptionKey.Arn DynamicFewShotLogGroup: Type: AWS::Logs::LogGroup Properties: LogGroupName: !Sub "/aws/lambda/${LambdaFunctionName}" RetentionInDays: !Ref LogRetentionDays - KmsKeyId: !GetAtt GenAIIDPCustomerManagedEncryptionKeyArn + KmsKeyId: !GetAtt CustomerManagedEncryptionKey.Arn DynamicFewShotVectorBucket: Type: AWS::S3Vectors::VectorBucket - Metadata: - cfn_nag: - rules_to_suppress: - - id: W84 - reason: "Demo function - KMS CMK not required, but can be added by customer for production use cases" - # checkov:skip=CKV_AWS_158: "Demo function - KMS CMK not required, but can be added by customer for production use cases" + Condition: ShouldCreateVectorBucket Properties: - VectorBucketName: !Ref VectorBucketName EncryptionConfiguration: - SseType: "AES256" + SseType: 
"aws:kms" + KmsKeyArn: !GetAtt CustomerManagedEncryptionKey.Arn - DynamicFewShotVectorIndex: + DocumentsIndex: Type: AWS::S3Vectors::Index + Condition: ShouldCreateVectorIndex Properties: - IndexName: !Ref VectorIndexName DataType: "float32" Dimension: !Ref VectorDimensions DistanceMetric: "cosine" @@ -204,7 +227,111 @@ Resources: - "classPrompt" - "attributesPrompt" - "imagePath" - VectorBucketArn: !Ref DynamicFewShotVectorBucket + VectorBucketName: !If + - ShouldCreateVectorBucket + - !Ref AWS::NoValue + - VectorBucketName + VectorBucketArn: !If + - ShouldCreateVectorBucket + - !Ref DynamicFewShotVectorBucket + - !Ref AWS::NoValue + + DatasetBucket: + Type: AWS::S3::Bucket + Condition: ShouldCreateDatasetBucket + DeletionPolicy: RetainExceptOnCreate + Properties: + BucketEncryption: + ServerSideEncryptionConfiguration: + - ServerSideEncryptionByDefault: + SSEAlgorithm: aws:kms + KMSMasterKeyID: !Ref CustomerManagedEncryptionKey + PublicAccessBlockConfiguration: + BlockPublicAcls: true + BlockPublicPolicy: true + IgnorePublicAcls: true + RestrictPublicBuckets: true + VersioningConfiguration: + Status: Enabled + LoggingConfiguration: + DestinationBucketName: !Ref IDPS3LoggingBucketName + LogFilePrefix: fewshot-dataset-bucket-logs/ + + DatasetBucketPolicy: + Type: AWS::S3::BucketPolicy + Condition: ShouldCreateDatasetBucket + Properties: + Bucket: !Ref DatasetBucket + PolicyDocument: + Version: "2012-10-17" + Statement: + - Sid: EnforceSSLOnly + Effect: Deny + Principal: "*" + Action: "s3:*" + Resource: + - !Sub "${DatasetBucket.Arn}/*" + - !Sub "${DatasetBucket.Arn}" + Condition: + Bool: + "aws:SecureTransport": false + + CustomerManagedEncryptionKey: + Type: AWS::KMS::Key + Metadata: + security-matrix: + rules_to_suppress: + - id: IAM-005 + reason: "No cross-account access - only same account root and AWS services" + - id: KMS-007 + reason: "KMS monitoring not required for this IDP solution - comprehensive CloudWatch monitoring already in place" + - id: KMS-002 + reason: "kms:* permission for account root is standard pattern for administrative access to KMS keys" + Properties: + Description: KMS key for encryption of dynamic few-shot resources + EnableKeyRotation: true + KeyPolicy: + Version: "2012-10-17" + Statement: + - Sid: Enable IAM User Permissions + Effect: Allow + Principal: + AWS: !Sub "arn:${AWS::Partition}:iam::${AWS::AccountId}:root" + Action: kms:* + Resource: "*" + - Sid: Allow lambda to access the Keys + Effect: Allow + Principal: + AWS: !Sub "arn:${AWS::Partition}:iam::${AWS::AccountId}:root" + Action: + - kms:Encrypt + - kms:Decrypt + - kms:ReEncrypt* + - kms:GenerateDataKey* + - kms:DescribeKey + Resource: "*" + - Sid: Allow CloudWatch Logs to use the key + Effect: Allow + Principal: + Service: !Sub "logs.${AWS::URLSuffix}" + Action: + - kms:Encrypt + - kms:Decrypt + - kms:ReEncrypt* + - kms:GenerateDataKey* + - kms:DescribeKey + Resource: "*" + - Sid: Allow S3 Vectors indexing service to use the key + Effect: Allow + Principal: + Service: !Sub "indexing.s3vectors.${AWS::URLSuffix}" + Action: + - kms:Encrypt + - kms:Decrypt + - kms:ReEncrypt* + - kms:GenerateDataKey* + - kms:DescribeKey + Resource: "*" Outputs: @@ -220,17 +347,26 @@ Outputs: Description: CloudWatch Log Group for monitoring demo Lambda execution Value: !Ref DynamicFewShotLogGroup - DynamicFewShotVectorBucketArn: + VectorBucketName: Description: S3 Vectors bucket for dynamic few-shot examples - Value: !Ref DynamicFewShotVectorBucket + Value: !If + - ShouldCreateVectorBucket + - !Select [1, !Split ["/", 
!Ref DynamicFewShotVectorBucket]] + - !Ref VectorBucketName - DynamicFewShotVectorIndexArn: + VectorIndexName: Description: S3 Vectors index for dynamic few-shot examples - Value: !Ref DynamicFewShotVectorIndex + Value: !If + - ShouldCreateVectorIndex + - !Select [3, !Split ["/", !Ref DocumentsIndex]] + - !Ref VectorIndexName - DynamicFewShotDatasetBucket: - Description: S3 Bucket for example data sets - Value: !Ref DynamicFewShotDatasetBucket + DatasetBucket: + Description: S3 bucket for example data sets + Value: !If + - ShouldCreateDatasetBucket + - !Ref DatasetBucket + - !Ref DatasetBucketName UsageInstructions: Description: How to use this Lambda in your IDP configuration From 0835cdc0953d0e3475ce2a7cdf8a859e2bfa9b3a Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Wed, 17 Dec 2025 13:18:55 +0100 Subject: [PATCH 32/39] chore: allow access to IDP output bucket --- ...ic-few-shot.py => IDP-dynamic-few-shot.py} | 0 plugins/dynamic-few-shot-lambda/template.yml | 38 +++++++++++-------- 2 files changed, 22 insertions(+), 16 deletions(-) rename plugins/dynamic-few-shot-lambda/src/{GENAIIDP-dynamic-few-shot.py => IDP-dynamic-few-shot.py} (100%) diff --git a/plugins/dynamic-few-shot-lambda/src/GENAIIDP-dynamic-few-shot.py b/plugins/dynamic-few-shot-lambda/src/IDP-dynamic-few-shot.py similarity index 100% rename from plugins/dynamic-few-shot-lambda/src/GENAIIDP-dynamic-few-shot.py rename to plugins/dynamic-few-shot-lambda/src/IDP-dynamic-few-shot.py diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index ca57fa63..7bbe1206 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -100,7 +100,18 @@ Parameters: # GenAI IDP parameters IDPS3LoggingBucketName: Type: String - Description: "IDP LoggingBucket Name" + Description: + IDP LoggingBucket Name, to store access logs for the dataset bucket + + IDPS3OutputBucketName: + Type: String + Description: >- + IDP S3OutputBucketName, to read the documents being processed + + IDPCustomerManagedEncryptionKeyArn: + Type: String + Description: >- + IDP CustomerManagedEncryptionKey ARN, to decrypt documents being read from the output bucket Conditions: HasPermissionsBoundary: !Not [!Equals [!Ref PermissionsBoundaryArn, ""]] @@ -131,7 +142,7 @@ Resources: FunctionName: !Ref LambdaFunctionName PermissionsBoundary: !If [HasPermissionsBoundary, !Ref PermissionsBoundaryArn, !Ref AWS::NoValue] CodeUri: ./src - Handler: GENAIIDP-dynamic-few-shot.lambda_handler + Handler: IDP-dynamic-few-shot.lambda_handler Runtime: python3.12 Architectures: - arm64 @@ -144,8 +155,8 @@ Resources: S3VECTOR_BUCKET: !If - ShouldCreateVectorBucket # Error: Requested attribute VectorBucketName must be a readonly property in schema for AWS::S3Vectors::VectorBucket - # - !GetAtt DynamicFewShotVectorBucket.VectorBucketName - - !Select [1, !Split ["/", !Ref DynamicFewShotVectorBucket]] + # - !GetAtt VectorBucket.VectorBucketName + - !Select [1, !Split ["/", !Ref VectorBucket]] - !Ref VectorBucketName S3VECTOR_INDEX: !If - ShouldCreateVectorIndex @@ -167,6 +178,8 @@ Resources: - ShouldCreateDatasetBucket - !Ref DatasetBucket - !Ref DatasetBucketName + - S3ReadPolicy: + BucketName: !Ref IDPS3OutputBucketName - Statement: - Effect: Allow Action: cloudwatch:PutMetricData @@ -188,17 +201,14 @@ Resources: - !Ref DocumentsIndex - !If - ShouldCreateVectorBucket - - !Sub "${DynamicFewShotVectorBucket}/index/${DocumentsIndex}" + - !Sub "${VectorBucket}/index/${DocumentsIndex}" - !Sub 
"arn:${AWS::Partition}:s3vectors:${AWS::Region}:${AWS::AccountId}:bucket/${VectorBucketName}/index/${DocumentsIndex}" - Effect: Allow Action: - - kms:Encrypt - kms:Decrypt - - kms:ReEncrypt* - - kms:GenerateDataKey* - - kms:DescribeKey Resource: - !GetAtt CustomerManagedEncryptionKey.Arn + - !Ref IDPCustomerManagedEncryptionKeyArn DynamicFewShotLogGroup: Type: AWS::Logs::LogGroup @@ -207,7 +217,7 @@ Resources: RetentionInDays: !Ref LogRetentionDays KmsKeyId: !GetAtt CustomerManagedEncryptionKey.Arn - DynamicFewShotVectorBucket: + VectorBucket: Type: AWS::S3Vectors::VectorBucket Condition: ShouldCreateVectorBucket Properties: @@ -229,12 +239,8 @@ Resources: - "imagePath" VectorBucketName: !If - ShouldCreateVectorBucket - - !Ref AWS::NoValue + - !Select [1, !Split ["/", !Ref VectorBucket]] - VectorBucketName - VectorBucketArn: !If - - ShouldCreateVectorBucket - - !Ref DynamicFewShotVectorBucket - - !Ref AWS::NoValue DatasetBucket: Type: AWS::S3::Bucket @@ -351,7 +357,7 @@ Outputs: Description: S3 Vectors bucket for dynamic few-shot examples Value: !If - ShouldCreateVectorBucket - - !Select [1, !Split ["/", !Ref DynamicFewShotVectorBucket]] + - !Select [1, !Split ["/", !Ref VectorBucket]] - !Ref VectorBucketName VectorIndexName: From b05827cd054614dda0376fac181ec7a112fb0424 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Wed, 17 Dec 2025 13:19:55 +0100 Subject: [PATCH 33/39] chore: fix samconfig.toml --- plugins/dynamic-few-shot-lambda/samconfig.toml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/plugins/dynamic-few-shot-lambda/samconfig.toml b/plugins/dynamic-few-shot-lambda/samconfig.toml index e25430e5..ce714fd8 100644 --- a/plugins/dynamic-few-shot-lambda/samconfig.toml +++ b/plugins/dynamic-few-shot-lambda/samconfig.toml @@ -1,11 +1,10 @@ version = 0.1 [default.deploy.parameters] -stack_name = "GENAIIDP-dynamic-few-shot-stack" +stack_name = "IDP-dynamic-few-shot" resolve_s3 = true -s3_prefix = "GENAIIDP-dynamic-few-shot-stack" +s3_prefix = "IDP-dynamic-few-shot" region = "us-east-1" capabilities = "CAPABILITY_IAM" disable_rollback = true -parameter_overrides = "PermissionsBoundaryArn=\"\" VectorBucketName=\"genaiidp-dynamic-few-shot\" VectorIndexName=\"documents\" VectorDimensions=\"3072\" ModelId=\"amazon.nova-2-multimodal-embeddings-v1:0\" TopK=\"2\" LambdaFunctionName=\"GENAIIDP-dynamic-few-shot\"" image_repositories = [] From c2e5a14f36dff714405520227241bdad632bfd84 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Wed, 17 Dec 2025 13:29:47 +0100 Subject: [PATCH 34/39] chore: add reasoning for cfn_nag --- plugins/dynamic-few-shot-lambda/template.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index 7bbe1206..3dee77ad 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -127,14 +127,14 @@ Resources: cfn_nag: rules_to_suppress: - id: W89 - reason: "Demo function - does not require VPC access" + reason: "Function does not require VPC access as it only interacts with AWS services via APIs" - id: W92 - reason: "Demo function - does not require reserved concurrency as it scales based on demand" + reason: "Function does not require reserved concurrency as it scales based on demand" - id: W58 - reason: "Demo function - DLQ not required" + reason: "Function does not require DLQ as processing and retries are handled by the IDP framework" - id: W11 - reason: "Demo function - allow * 
resource on its permissions policy" - # checkov:skip=CKV_AWS_116: "DLQ not required for AppSync resolver function as GraphQL handles retries" + reason: "Allow * resource on its permissions policy for CloudWatch metrics" + # checkov:skip=CKV_AWS_116: "Function does not require DLQ" # checkov:skip=CKV_AWS_117: "Function does not require VPC access as it only interacts with AWS services via APIs" # checkov:skip=CKV_AWS_115: "Function does not require reserved concurrency as it scales based on demand" # checkov:skip=CKV_AWS_173: "Environment variables do not contain sensitive data - only configuration values like feature flags and non-sensitive settings" From c477c444f393b7149ce02ed9da709381621d8e9a Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Wed, 17 Dec 2025 13:31:00 +0100 Subject: [PATCH 35/39] chore: add more reasoning --- plugins/dynamic-few-shot-lambda/template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index 3dee77ad..fe1a1635 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -134,7 +134,7 @@ Resources: reason: "Function does not require DLQ as processing and retries are handled by the IDP framework" - id: W11 reason: "Allow * resource on its permissions policy for CloudWatch metrics" - # checkov:skip=CKV_AWS_116: "Function does not require DLQ" + # checkov:skip=CKV_AWS_116: "Function does not require DLQ as processing and retries are handled by the IDP framework" # checkov:skip=CKV_AWS_117: "Function does not require VPC access as it only interacts with AWS services via APIs" # checkov:skip=CKV_AWS_115: "Function does not require reserved concurrency as it scales based on demand" # checkov:skip=CKV_AWS_173: "Environment variables do not contain sensitive data - only configuration values like feature flags and non-sensitive settings" From 1f8eb82bd5f189e3a445bffa3d7f06cc36dfd29d Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Wed, 17 Dec 2025 13:37:55 +0100 Subject: [PATCH 36/39] chore: decode base64 images --- lib/idp_common_pkg/idp_common/extraction/service.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/idp_common_pkg/idp_common/extraction/service.py b/lib/idp_common_pkg/idp_common/extraction/service.py index ead4e0a1..2bf21e77 100644 --- a/lib/idp_common_pkg/idp_common/extraction/service.py +++ b/lib/idp_common_pkg/idp_common/extraction/service.py @@ -10,6 +10,7 @@ from __future__ import annotations +import base64 import json import logging import os @@ -461,6 +462,13 @@ def _convert_image_uris_to_bytes_in_content( f"Invalid file path {image_uri} - expecting S3 path" ) + converted_item = image.prepare_bedrock_image_attachment(image_bytes) + elif "image_base64" in item: + image_base64 = item["image_base64"] + + # Decode image content + image_bytes = base64.b64decode(image_base64) + converted_item = image.prepare_bedrock_image_attachment(image_bytes) elif "image" in item: # Keep existing image objects as-is From 7dd9f057ae8679684a4f859d0a62d5b94f833528 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Wed, 17 Dec 2025 13:42:56 +0100 Subject: [PATCH 37/39] chore: return base64 encoded images instead of image_uri --- .../src/IDP-dynamic-few-shot.py | 11 ++++++++++- plugins/dynamic-few-shot-lambda/template.yml | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/plugins/dynamic-few-shot-lambda/src/IDP-dynamic-few-shot.py 
b/plugins/dynamic-few-shot-lambda/src/IDP-dynamic-few-shot.py
index 49aab6c3..1cf760e5 100644
--- a/plugins/dynamic-few-shot-lambda/src/IDP-dynamic-few-shot.py
+++ b/plugins/dynamic-few-shot-lambda/src/IDP-dynamic-few-shot.py
@@ -218,7 +218,16 @@ def _build_text_and_image_content(
     # Add images
     if image_content:
         for image_uri in image_content:
-            content.append({"image_uri": image_uri})
+            # Load image content
+            if image_uri.startswith("s3://"):
+                # Direct S3 URI
+                image_bytes = s3.get_binary_content(image_uri)
+            else:
+                raise ValueError(f"Invalid file path {image_uri} - expecting S3 path")
+
+            # Convert bytes to base64 string
+            image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+            content.append({"image_base64": image_base64})

     # Add text after image
     after_text = _prepare_prompt_from_template(
diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml
index fe1a1635..b7c2548b 100644
--- a/plugins/dynamic-few-shot-lambda/template.yml
+++ b/plugins/dynamic-few-shot-lambda/template.yml
@@ -148,7 +148,7 @@ Resources:
         - arm64
       Timeout: 300
       MemorySize: 512
-      Description: Demo Lambda function for GenAI IDP dynamic few-shot prompting using S3 Vectors
+      Description: Lambda function for GenAI IDP dynamic few-shot prompting using S3 Vectors
       Environment:
         Variables:
           LOG_LEVEL: !Ref LogLevel

From 800ed17b140fd7de223a18806afec4465095b339 Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Wed, 17 Dec 2025 13:48:57 +0100
Subject: [PATCH 38/39] chore: fix parameter

---
 plugins/dynamic-few-shot-lambda/template.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml
index b7c2548b..44405564 100644
--- a/plugins/dynamic-few-shot-lambda/template.yml
+++ b/plugins/dynamic-few-shot-lambda/template.yml
@@ -379,7 +379,7 @@ Outputs:
     Value: !Sub |
       Add this ARN to your extraction config:
         extraction:
-          dynamic_few_shot_lambda_arn: "${DynamicFewShotFunction.Arn}"
+          custom_prompt_lambda_arn: "${DynamicFewShotFunction.Arn}"

   MonitoringLink:
     Description: Direct link to CloudWatch logs for this function

From 2eb8573cb493a42074556f628cd5e0437b85eba5 Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Wed, 17 Dec 2025 14:08:35 +0100
Subject: [PATCH 39/39] chore: fix permission policy for s3 vectors

---
 plugins/dynamic-few-shot-lambda/template.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml
index 44405564..25184437 100644
--- a/plugins/dynamic-few-shot-lambda/template.yml
+++ b/plugins/dynamic-few-shot-lambda/template.yml
@@ -201,8 +201,8 @@ Resources:
               - !Ref DocumentsIndex
               - !If
                 - ShouldCreateVectorBucket
-                - !Sub "${VectorBucket}/index/${DocumentsIndex}"
-                - !Sub "arn:${AWS::Partition}:s3vectors:${AWS::Region}:${AWS::AccountId}:bucket/${VectorBucketName}/index/${DocumentsIndex}"
+                - !Sub "${VectorBucket}/index/${VectorIndexName}"
+                - !Sub "arn:${AWS::Partition}:s3vectors:${AWS::Region}:${AWS::AccountId}:bucket/${VectorBucketName}/index/${VectorIndexName}"
             - Effect: Allow
               Action:
                 - kms:Decrypt
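
Note on the image hand-off changed by patches 36 and 37: the Lambda now returns inline base64 image content ("image_base64" items) instead of "image_uri" S3 references, and the extraction service decodes those items back to raw bytes before preparing the Bedrock image attachment. A minimal sketch of that round trip follows; the helper names are illustrative only and are not part of the plugin or idp_common code:

import base64

def to_content_item(image_bytes: bytes) -> dict:
    # Encode as the Lambda does after patch 37: base64 keeps the
    # returned content items JSON-serializable.
    return {"image_base64": base64.b64encode(image_bytes).decode("utf-8")}

def from_content_item(item: dict) -> bytes:
    # Decode as the extraction service does after patch 36, recovering
    # the raw bytes used to build the Bedrock image attachment.
    return base64.b64decode(item["image_base64"])

sample = b"\x89PNG\r\n\x1a\n"  # illustrative image bytes (PNG signature)
assert from_content_item(to_content_item(sample)) == sample

One trade-off worth noting: inline base64 grows the Lambda response by roughly a third over the raw image size, which counts against the 6 MB synchronous invocation response limit, whereas the earlier S3 URIs kept the payload small at the cost of a second S3 read in the caller.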