From 879f3359befae10e2cf0d707904bcc551403fb5a Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 28 Nov 2025 18:49:20 +0100 Subject: [PATCH 01/39] chore: fix missing substitution for custom_prompt_lambda_arn --- patterns/pattern-2/template.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/patterns/pattern-2/template.yaml b/patterns/pattern-2/template.yaml index 6605c8af..2bd9c364 100644 --- a/patterns/pattern-2/template.yaml +++ b/patterns/pattern-2/template.yaml @@ -1026,7 +1026,7 @@ Resources: order: 7 custom_prompt_lambda_arn: type: string - description: "(Optional) ARN of a Lambda function to generate custom extraction prompts. Function name must start with 'GENAIIDP-'. If not provided, default prompts will be used. The Lambda function receives the complete config, prompt placeholders, default task prompt content, and serialized document, and returns custom system_prompt and task_prompt_content. Example: arn:${AWS::Partition}:lambda:us-east-1:123456789012:function:GENAIIDP-my-extractor" + description: !Sub "(Optional) ARN of a Lambda function to generate custom extraction prompts. Function name must start with 'GENAIIDP-'. If not provided, default prompts will be used. The Lambda function receives the complete config, prompt placeholders, default task prompt content, and serialized document, and returns custom system_prompt and task_prompt_content. Example: arn:${AWS::Partition}:lambda:us-east-1:123456789012:function:GENAIIDP-my-extractor" order: 8 assessment: order: 5 From 335f87b004a56f54c67c9d59e36526b012a65fba Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 28 Nov 2025 21:29:00 +0100 Subject: [PATCH 02/39] feat: dynamic-few shot Lambda using S3 Vectors --- .../GENAIIDP-dynamic-few-shot.py | 257 +++++++++++++ .../dynamic-few-shot-lambda/README.md | 364 ++++++++++++++++++ .../dynamic-few-shot-lambda/requirements.txt | 1 + .../dynamic-few-shot-lambda/samconfig.toml | 11 + .../dynamic-few-shot-lambda/template.yml | 204 ++++++++++ 5 files changed, 837 insertions(+) create mode 100644 notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py create mode 100644 notebooks/examples/dynamic-few-shot-lambda/README.md create mode 100644 notebooks/examples/dynamic-few-shot-lambda/requirements.txt create mode 100644 notebooks/examples/dynamic-few-shot-lambda/samconfig.toml create mode 100644 notebooks/examples/dynamic-few-shot-lambda/template.yml diff --git a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py new file mode 100644 index 00000000..b2c6272d --- /dev/null +++ b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py @@ -0,0 +1,257 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +""" +Lambda function to provide examples with ground truth data based on S3 Vectors lookup. 
+ +Key Features Demonstrated: +- Dynamically retrieve similar examples based on document content using vector similarity search +- Provide few-shot examples to improve extraction accuracy through example-based prompting +- Leverage S3 Vectors for efficient similarity search across large example datasets +- Integrate multimodal embeddings using Amazon Nova models for image-based similarity +- Customize example selection based on document characteristics and business rules +""" + +import json +import logging +import base64 +import boto3 +import os + +from idp_common import bedrock, s3 + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# Parse environment variables with error handling +try: + S3VECTOR_BUCKET = os.environ['S3VECTOR_BUCKET'] + S3VECTOR_INDEX = os.environ['S3VECTOR_INDEX'] + S3VECTOR_DIMENSIONS = int(os.environ['S3VECTOR_DIMENSIONS']) + MODEL_ID = os.environ['MODEL_ID'] + TOP_K = int(os.environ['TOP_K']) +except (KeyError, ValueError, IndexError) as e: + logger.error(f"Failed to parse environment variables: {e}") + raise + +# Initialize clients +s3vectors = boto3.client('s3vectors') +bedrock_client = bedrock.BedrockClient() + +def lambda_handler(event, context): + """ + Process a document to find similar examples using S3 Vectors similarity search. + + Input event: + { + "class_label": "", + "document_texts": ["", "", ...], + "image_content": ["", "", ...] + } + + Return format: + [ + { + "attributes_prompt": "expected attributes are: ...", + "class_prompt": "This is an example of the class 'invoice'", + "distance": 0.892344521145, + "image_content": ["", "", ...] + } + ] + """ + + try: + logger.info("=== DYNAMIC FEW-SHOT LAMBDA INVOKED ===") + logger.debug(f"Complete input event: {json.dumps(event, indent=2)}") + + # Validate input + class_label = event.get("class_label") + document_texts = event.get("document_texts", []) + image_content = event.get("image_content", []) + + logger.info(f"=== INPUT VALUES ===") + logger.info(f"Class label: {class_label if class_label else 'Not specified'}") + logger.info(f"Document texts: {len(document_texts)}") + logger.info(f"Image content: {len(image_content)}") + + # Decode input data + image_data = _decode_images(image_content) + + # Find similar items using S3 vectors lookup from image similarity + result = _s3vectors_find_similar_items(image_data) + + # Log complete output structure + logger.info(f"=== OUTPUT ANALYSIS ===") + logger.debug(f"Complete result: {json.dumps(result, indent=2)}") + logger.info(f"Output items: {len(result)}") + + logger.info("=== DYNAMIC FEW-SHOT LAMBDA COMPLETED ===") + return result + + except Exception as e: + logger.error(f"=== DYNAMIC FEW-SHOT LAMBDA ERROR ===") + logger.error(f"Error type: {type(e).__name__}") + logger.error(f"Error message: {str(e)}") + logger.error(f"Input event keys: {list(event.keys()) if 'event' in locals() else 'Unknown'}") + # In demo, we'll fail gracefully with detailed error info + raise Exception(f"Dynamic few-shot Lambda failed: {str(e)}") + +def _decode_images(image_content): + """Base64 decode image content to bytes""" + result = [] + for image_base64 in image_content: + image_data = base64.b64decode(image_base64) + result.append(image_data) + return result + +def _encode_images(image_content): + """Base64 encode image content to JSON-serializable string""" + result = [] + for image_bytes in image_content: + image_base64 = base64.b64encode(image_bytes).decode("utf-8") + result.append(image_base64) + return result + +def 
_s3vectors_find_similar_items(image_data): + """Find similar items for input""" + + # find similar items based on image similarity only + similar_items = {} + for page_image in image_data: + result = _s3vectors_find_similar_items_from_image(image_data) + _merge_examples(similar_items, result) + + # create result set + result = [] + for key, example in similar_items.items(): + metadata = example.get("metadata", {}) + attributes_prompt = metadata.get("attributesPrompt") + + # Only process this example if it has a non-empty attributesPrompt + if not attributes_prompt or not attributes_prompt.strip(): + logger.info( + f"Skipping example with empty attributesPrompt: {key}" + ) + continue + + attributes = _extract_metadata(metadata) + result.append(attributes) + + return result + +def _s3vectors_find_similar_items_from_image(page_image): + """Search for similar items using image query""" + embedding = bedrock_client.generate_embedding( + image_source=page_image, + model_id=MODEL_ID, + dimensions=S3VECTOR_DIMENSIONS, + ) + response = s3vectors.query_vectors( + vectorBucketName=S3VECTOR_BUCKET, + indexName=S3VECTOR_INDEX, + queryVector={"float32": embedding}, + topK=TOP_K, + returnDistance=True, + returnMetadata=True + ) + return response["vectors"] + +def _merge_examples(examples, new_examples): + """ + Merge in-place new examples into the result list, avoiding duplicates. + + Args: + examples: Dict of existing examples + new_examples: List of new examples to be merged + """ + for new_example in new_examples: + key = new_example["key"] + new_distance = new_example.get("distance", 1.0) + + # update example + if combined_examples.get(key): + existing_distance = combined_examples[key].get("distance", 1.0) + examples[key]["distance"] = min(new_distance, existing_distance) + examples[key]["metadata"] = new_example.get("metadata") + # insert example + else: + examples[key] = { + "distance": new_distance, + "metadata": new_example.get("metadata") + } + +def _extract_metadata(metadata, distance): + """Create result object from S3 vectors metadata""" + # Result object attributes + attributes = { + "attributes_prompt": metadata.get("attributesPrompt"), + "class_prompt": metadata.get("classPrompt"), + "distance": distance, + } + + image_path = metadata.get("imagePath") + if image_path: + image_data = _get_image_data_from_s3_path(image_path) + encoded_images = _encode_images(image_data) + attributes["image_content"] = encoded_images + + return attributes + +def _get_image_data_from_s3_path(image_path): + """ + Load images from image path + + Args: + image_path: Path to image file, directory, or S3 prefix + + Returns: + List of images (bytes) + """ + # Get list of image files from the path (supports directories/prefixes) + image_files = _get_image_files_from_s3_path(image_path) + image_content = [] + + # Process each image file + for image_file_path in image_files: + try: + # Load image content + if image_file_path.startswith("s3://"): + # Direct S3 URI + image_bytes = s3.get_binary_content(image_file_path) + else: + raise ValueError( + f"Invalid file path {image_path} - expecting S3 path" + ) + + image_content.append(image_bytes) + except Exception as e: + logger.warning(f"Failed to load image {image_file_path}: {e}") + continue + + return image_content + +def _get_image_files_from_s3_path(image_path): + """ + Get list of image files from an S3 path. 

    Args:
        image_path: Path to image file, directory, or S3 prefix

    Returns:
        List of image file paths/URIs sorted by filename
    """
    # Handle S3 URIs
    if not image_path.startswith("s3://"):
        raise ValueError(
            f"Invalid file path {image_path} - expecting S3 URI"
        )

    # Check if it's a direct file or a prefix
    if image_path.endswith(
        (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp")
    ):
        # Direct S3 file
        return [image_path]
    else:
        # S3 prefix - list all images
        return s3.list_images_from_path(image_path)
diff --git a/notebooks/examples/dynamic-few-shot-lambda/README.md b/notebooks/examples/dynamic-few-shot-lambda/README.md
new file mode 100644
index 00000000..a6f4219e
--- /dev/null
+++ b/notebooks/examples/dynamic-few-shot-lambda/README.md
@@ -0,0 +1,364 @@
# Dynamic Few-Shot Prompting - Complete Guide

This directory contains the **complete implementation and demonstration** of the dynamic few-shot prompting feature for the GenAI IDP Accelerator. This feature enables users to dynamically retrieve few-shot examples using S3 Vectors similarity search to improve extraction accuracy for Pattern 2.

## 🎯 Overview

The dynamic few-shot prompting feature allows you to:

- **Dynamically retrieve similar examples** based on document content using vector similarity search
- **Provide few-shot examples** to improve extraction accuracy through example-based prompting
- **Leverage S3 Vectors** for efficient similarity search across large example datasets
- **Integrate multimodal embeddings** using Amazon Nova models for image-based similarity
- **Customize example selection** based on document characteristics and business rules

## 📁 Files in This Directory

- **`GENAIIDP-dynamic-few-shot.py`** - Dynamic few-shot Lambda function with S3 Vectors lookup
- **`template.yml`** - CloudFormation SAM template to deploy the complete stack
- **`requirements.txt`** - Python dependencies for the Lambda function
- **`samconfig.toml`** - Default SAM deployment configuration for the stack
- **`README.md`** - This comprehensive documentation and guide

## 🏗️ Architecture

```mermaid
flowchart TD
    A[Document Processing] --> B{Dynamic few-shot configured?}
    B -->|No| C[Use Default Extraction]
    B -->|Yes| D[Invoke Dynamic few-shot Lambda]

    subgraph Lambda
        D --> E[Receive Document Images]
        E --> F[Generate Embeddings with Nova]
        F --> G[Query S3 Vectors Index]
        G --> H[Retrieve Similar Examples]
        H --> I[Load Example Images from S3]
        I --> J[Format Examples for Bedrock]
    end

    J --> K[Use Examples in Extraction Prompt]
    C --> L[Continue with Standard Extraction]
    K --> L

    subgraph Input
        M[Document Class]
        N[Document Text]
        O[Document Images]
    end

    subgraph Output
        P[Example Attributes Prompts]
        Q[Example Images]
        R[Similarity Distances]
    end

    D -.-> M
    D -.-> N
    D -.-> O

    J -.-> P
    J -.-> Q
    J -.-> R
```

## Quick Start

### Step 1: Deploy the Dynamic Few-Shot Stack

```bash
# Navigate to the dynamic-few-shot-lambda directory
cd notebooks/examples/dynamic-few-shot-lambda

# Deploy using AWS SAM
sam deploy --guided
```

### Step 2: Get the Lambda ARN

After deployment, get the ARN from CloudFormation outputs:

```bash
aws cloudformation describe-stacks \
  --stack-name GENAIIDP-dynamic-few-shot-stack \
  --query 'Stacks[0].Outputs[?OutputKey==`DynamicFewShotFunctionArn`].OutputValue' \
  --output text
```

### Step 3: Populate the Examples Dataset

Use the [fewshot_dataset_import.ipynb](../../misc/fewshot_dataset_import.ipynb) notebook to import a dataset into S3 Vectors, or manually upload your example documents and metadata to the S3 bucket and vector index created by the stack.

### Step 4: Configure IDP to Use Dynamic Few-Shot Prompting

Add the Lambda ARN to your IDP extraction configuration:

```yaml
extraction:
  dynamic_few_shot_lambda_arn: "arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot"
```

## Lambda Interface

### Input Payload Structure
```json
{
  "class_label": "invoice",
  "document_texts": [
    "Invoice text or markdown from page 1...",
    "Invoice text or markdown from page 2..."
  ],
  "image_content": [
    "base64_encoded_image_1",
    "base64_encoded_image_2"
  ]
}
```

### Output Payload Structure
```json
[
  {
    "attributes_prompt": "Expected attributes are: invoice_number [Unique identifier], invoice_date [Invoice date], total_amount [Total amount]...",
    "class_prompt": "This is an example of the class 'invoice'",
    "distance": 0.892344521145,
    "image_content": ["<base64_encoded_image>", "<base64_encoded_image>", ...]
  }
]
```

## Core Functionality

### 1. Vector Similarity Search

The Lambda uses Amazon Nova multimodal embeddings to find similar examples:

```python
# Generate embedding from document image
embedding = bedrock_client.generate_embedding(
    image_source=image_data,
    model_id=MODEL_ID,
    dimensions=S3VECTOR_DIMENSIONS,
)

# Query S3 Vectors for similar examples
response = s3vectors.query_vectors(
    vectorBucketName=S3VECTOR_BUCKET,
    indexName=S3VECTOR_INDEX,
    queryVector={"float32": embedding},
    topK=TOP_K,
    returnDistance=True,
    returnMetadata=True
)
```

### 2. Example Merging and Deduplication

Multiple document images are processed and results are merged to avoid duplicates:

```python
def merge_examples(combined_examples, new_examples):
    """Merge examples, keeping the best similarity score for duplicates"""
    for new_example in new_examples:
        key = new_example["key"]
        if combined_examples.get(key):
            # Keep the better (lower) distance score
            combined_examples[key]["distance"] = min(
                new_example.get("distance"), 
                combined_examples[key]["distance"]
            )
```

### 3. Example Image Loading

The Lambda loads example images from S3 paths stored in vector metadata:

```python
def get_image_files_from_s3_path(image_path: str) -> List[str]:
    """Get list of image files from S3 path or prefix"""
    if image_path.endswith((".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp")):
        return [image_path]  # Direct file
    else:
        return s3.list_images_from_path(image_path)  # Directory/prefix
```

## Configuration

### Environment Variables

The Lambda function uses these environment variables (set by the CloudFormation template):

- `S3VECTOR_BUCKET` - Name of the S3 Vectors bucket
- `S3VECTOR_INDEX` - Name of the S3 Vectors index
- `S3VECTOR_DIMENSIONS` - Embedding dimensions (e.g. `3072` for Nova Multimodal Embedding model)
- `MODEL_ID` - Bedrock model ID for embeddings (e.g. `amazon.nova-2-multimodal-embeddings-v1:0`)
- `TOP_K` - Number of similar examples to retrieve

### S3 Vectors Configuration

The stack creates:
- **Vector Bucket**: Encrypted S3 bucket for vector storage
- **Vector Index**: Cosine similarity index with 3072 dimensions
- **Metadata Configuration**: Stores `classPrompt`, `attributesPrompt`, and `imagePath` as non-filterable metadata keys

## Monitoring and Troubleshooting

### CloudWatch Logs

Monitor the Lambda function logs:
- `/aws/lambda/GENAIIDP-dynamic-few-shot` - Dynamic few-shot Lambda logs

### Key Log Messages

**Successful Operation:**
```
=== DYNAMIC FEW-SHOT LAMBDA INVOKED ===
Class label: invoice
Document texts: 2
Image content: 2
Output items: 2
=== DYNAMIC FEW-SHOT LAMBDA COMPLETED ===
```

**Error Conditions:**
```
Failed to parse environment variables: 'S3VECTOR_BUCKET'
Skipping example with empty attributesPrompt: example-001
Failed to load image s3://bucket/path/page-1.jpg: error
=== DYNAMIC FEW-SHOT LAMBDA ERROR ===
```

### Performance Monitoring

Key metrics to monitor:
- **Lambda Duration**: Time to retrieve and process examples
- **S3 Vectors Query Time**: Vector similarity search performance
- **Example Count**: Number of examples returned per request
- **Error Rate**: Failed example retrievals

## Example Dataset Structure

### Vector Metadata Format

Each vector in the S3 Vectors index should have metadata:

```json
{
  "classLabel": "invoice",
  "classPrompt": "This is an example of the class 'invoice'",
  "attributesPrompt": "Expected attributes are: invoice_number [Unique identifier], invoice_date [Invoice date], total_amount [Total amount]...",
  "imagePath": "s3://examples-bucket/invoices/example-001/"
}
```

### Image Storage Structure

Example images should be stored in S3 with paths referenced in metadata:

```
s3://examples-bucket/
├── invoices/
│   ├── example-001/
│   │   ├── page-1.jpg
│   │   └── page-2.jpg
│   └── example-002/
│       └── invoice.png
└── receipts/
    ├── example-003/
    │   └── receipt.jpg
    └── example-004/
        └── receipt.png
```

## Production Considerations

### 1. Example Dataset Management

- **Quality Control**: Ensure high-quality, representative examples
- **Regular Updates**: Keep examples current with document variations
- **Metadata Consistency**: Maintain consistent attribute descriptions
- **Image Optimization**: Use appropriate image formats and sizes

### 2. Performance Optimization

```python
# Cache frequently accessed examples
# Optimize vector dimensions for your use case
# Use appropriate TOP_K values (typically 2-5)
# Consider batch processing for multiple documents
```

### 3. Security Considerations

- **Access Control**: Restrict access to example datasets
- **Data Privacy**: Ensure examples don't contain sensitive information
- **Encryption**: Use appropriate encryption for stored examples
- **Audit Logging**: Log example usage for compliance

### 4. Cost Optimization

- **Vector Index Size**: Monitor storage costs for large example sets
- **Embedding Generation**: Optimize frequency of embedding updates
- **Lambda Memory**: Right-size memory allocation based on usage
- **S3 Storage Classes**: Use appropriate storage classes for examples

## Deployment Options

### Option 1: AWS SAM (Recommended)
```bash
sam build
sam deploy --guided
```

### Option 2: AWS CLI
```bash
# Package and deploy
aws cloudformation package \
  --template-file template.yml \
  --s3-bucket your-deployment-bucket \
  --output-template-file packaged-template.yml

aws cloudformation deploy \
  --template-file packaged-template.yml \
  --stack-name GENAIIDP-dynamic-few-shot-stack \
  --capabilities CAPABILITY_IAM
```

## Cleanup

To remove the dynamic few-shot resources:

```bash
# Delete the CloudFormation stack
aws cloudformation delete-stack --stack-name GENAIIDP-dynamic-few-shot-stack

# Note: S3 buckets with retention policy will be retained
```

## Integration with IDP

### Configuration in IDP Stack

Add the dynamic few-shot Lambda ARN to your IDP configuration:

```yaml
# In your IDP stack parameters or configuration
extraction:
  dynamic_few_shot_lambda_arn: "arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot"
```

### Expected Behavior

When configured:
1. IDP processes document and extracts images/text
2. Dynamic few-shot Lambda is invoked with document data
3. Lambda returns similar examples with prompts and images
4. IDP includes examples in extraction prompt to Bedrock
5. Bedrock uses examples to improve extraction accuracy

## Next Steps

After deploying the dynamic few-shot stack:

1. **Populate example dataset** with representative documents
2. **Test similarity search** with sample documents
3. **Monitor performance** and adjust TOP_K as needed
4. **Integrate with IDP** using the Lambda ARN
5. **Evaluate accuracy improvements** with few-shot examples

The dynamic few-shot feature enables powerful few-shot learning while leveraging efficient vector similarity search for dynamic example selection.
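
## Appendix: Smoke-Testing the Deployed Lambda

To sanity-check a deployment end to end, you can invoke the function directly with the payload shape documented above. The snippet below is a minimal sketch, not part of the stack: the local image filename is hypothetical, and it assumes the default `GENAIIDP-dynamic-few-shot` function name from `samconfig.toml` and credentials that are allowed to invoke it.

```python
import base64
import json

import boto3

FUNCTION_NAME = "GENAIIDP-dynamic-few-shot"  # default LambdaFunctionName parameter

# Hypothetical local test image - substitute any document page you have on disk
with open("sample-invoice.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "class_label": "invoice",
    "image_content": [image_b64],
}

lambda_client = boto3.client("lambda")
response = lambda_client.invoke(
    FunctionName=FUNCTION_NAME,
    Payload=json.dumps(payload).encode("utf-8"),
)

# The function returns a JSON list of similar examples
examples = json.loads(response["Payload"].read())
for example in examples:
    print(example["distance"], example["class_prompt"])
```

A run against a freshly imported dataset should print one line per retrieved example, with lower distances indicating closer matches.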
\ No newline at end of file diff --git a/notebooks/examples/dynamic-few-shot-lambda/requirements.txt b/notebooks/examples/dynamic-few-shot-lambda/requirements.txt new file mode 100644 index 00000000..2048c02c --- /dev/null +++ b/notebooks/examples/dynamic-few-shot-lambda/requirements.txt @@ -0,0 +1 @@ +../../../lib/idp_common_pkg[extraction,docs_service] # extraction module and document service with dependencies diff --git a/notebooks/examples/dynamic-few-shot-lambda/samconfig.toml b/notebooks/examples/dynamic-few-shot-lambda/samconfig.toml new file mode 100644 index 00000000..e25430e5 --- /dev/null +++ b/notebooks/examples/dynamic-few-shot-lambda/samconfig.toml @@ -0,0 +1,11 @@ +version = 0.1 + +[default.deploy.parameters] +stack_name = "GENAIIDP-dynamic-few-shot-stack" +resolve_s3 = true +s3_prefix = "GENAIIDP-dynamic-few-shot-stack" +region = "us-east-1" +capabilities = "CAPABILITY_IAM" +disable_rollback = true +parameter_overrides = "PermissionsBoundaryArn=\"\" VectorBucketName=\"genaiidp-dynamic-few-shot\" VectorIndexName=\"documents\" VectorDimensions=\"3072\" ModelId=\"amazon.nova-2-multimodal-embeddings-v1:0\" TopK=\"2\" LambdaFunctionName=\"GENAIIDP-dynamic-few-shot\"" +image_repositories = [] diff --git a/notebooks/examples/dynamic-few-shot-lambda/template.yml b/notebooks/examples/dynamic-few-shot-lambda/template.yml new file mode 100644 index 00000000..927c9a65 --- /dev/null +++ b/notebooks/examples/dynamic-few-shot-lambda/template.yml @@ -0,0 +1,204 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +AWSTemplateFormatVersion: '2010-09-09' +Transform: AWS::Serverless-2016-10-31 +Description: Deploy demo Lambda function for GenAI IDP dynamic few-shot prompting + +Parameters: + + PermissionsBoundaryArn: + Type: String + Default: "" + Description: >- + (Optional) ARN of an existing IAM Permissions Boundary policy to attach to the Lambda execution role. + Leave blank if no Permissions Boundary is required. 
AllowedPattern: "^(|arn:aws[a-z-]*:iam::[0-9]{12}:policy/.+)$"
    ConstraintDescription: Must be empty or a valid IAM policy ARN

  VectorBucketName:
    Type: String
    Default: "genaiidp-dynamic-few-shot"

  VectorIndexName:
    Type: String
    Default: "documents"

  VectorDimensions:
    Type: Number
    Default: 3072

  ModelId:
    Type: String
    Default: "amazon.nova-2-multimodal-embeddings-v1:0"

  TopK:
    Type: Number
    Default: 2

  LambdaFunctionName:
    Type: String
    Default: "GENAIIDP-dynamic-few-shot"

Conditions:
  HasPermissionsBoundary: !Not [!Equals [!Ref PermissionsBoundaryArn, ""]]

Resources:

  DynamicFewShotFunction:
    Type: AWS::Serverless::Function
    Metadata:
      cfn_nag:
        rules_to_suppress:
          - id: W89
            reason: "Demo function - does not require VPC access"
          - id: W92
            reason: "Demo function - does not require reserved concurrency as it scales based on demand"
          - id: W58
            reason: "Demo function - DLQ not required"
      # checkov:skip=CKV_AWS_116: "Demo function - DLQ not required"
      # checkov:skip=CKV_AWS_117: "Function does not require VPC access as it only interacts with AWS services via APIs"
      # checkov:skip=CKV_AWS_115: "Function does not require reserved concurrency as it scales based on demand"
      # checkov:skip=CKV_AWS_173: "Environment variables do not contain sensitive data - only configuration values like feature flags and non-sensitive settings"
    Properties:
      FunctionName: !Ref LambdaFunctionName
      PermissionsBoundary: !If [HasPermissionsBoundary, !Ref PermissionsBoundaryArn, !Ref AWS::NoValue]
      CodeUri: ./
      Handler: GENAIIDP-dynamic-few-shot.lambda_handler
      Runtime: python3.12
      Architectures:
        - arm64
      Timeout: 300
      MemorySize: 512
      Description: Demo Lambda function for GenAI IDP dynamic few-shot prompting
      Environment:
        Variables:
          LOG_LEVEL: INFO
          S3VECTOR_BUCKET: !Ref VectorBucketName
          S3VECTOR_INDEX: !Ref VectorIndexName
          S3VECTOR_DIMENSIONS: !Ref VectorDimensions
          MODEL_ID: !Ref ModelId
          TOP_K: !Ref TopK
      LoggingConfig:
        LogGroup: !Ref DynamicFewShotLogGroup
      # Scoped permissions - basic execution/logging, dataset bucket reads, Bedrock invocation, and S3 Vectors queries
      Policies:
        - AWSLambdaBasicExecutionRole
        - S3ReadPolicy:
            BucketName: !Ref DynamicFewShotDatasetBucket
        - Statement:
            - Effect: Allow
              Action: cloudwatch:PutMetricData
              Resource: "*"
            - Effect: Allow
              Action:
                - bedrock:InvokeModel
                - bedrock:InvokeModelWithResponseStream
              Resource:
                - !Sub "arn:${AWS::Partition}:bedrock:*::foundation-model/*"
                - !Sub "arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:inference-profile/*"
            - Effect: Allow
              Action:
                - s3vectors:GetVectors
                - s3vectors:QueryVectors
              Resource:
                - !Ref DynamicFewShotVectorIndex

  DynamicFewShotLogGroup:
    Type: AWS::Logs::LogGroup
    Metadata:
      cfn_nag:
        rules_to_suppress:
          - id: W84
            reason: "Demo function - KMS CMK not required, but can be added by customer for production use cases"
      # checkov:skip=CKV_AWS_158: "Demo function - KMS CMK not required, but can be added by customer for production use cases"
    Properties:
      LogGroupName: !Sub "/aws/lambda/${LambdaFunctionName}"
      RetentionInDays: 7 # Short retention for demo purposes

  DynamicFewShotVectorBucket:
    Type: AWS::S3Vectors::VectorBucket
    Metadata:
      cfn_nag:
        rules_to_suppress:
          - id: W84
            reason: "Demo function - KMS CMK not required, but can be added by customer for production use cases"
      # checkov:skip=CKV_AWS_158: "Demo function - KMS CMK not required, but can be added by customer for 
production use cases" + Properties: + VectorBucketName: !Ref VectorBucketName + EncryptionConfiguration: + SseType: "AES256" + + DynamicFewShotVectorIndex: + Type: AWS::S3Vectors::Index + Properties: + IndexName: !Ref VectorIndexName + DataType: "float32" + Dimension: !Ref VectorDimensions + DistanceMetric: "cosine" + MetadataConfiguration: + NonFilterableMetadataKeys: + - "classPrompt" + - "attributesPrompt" + - "imagePath" + VectorBucketArn: !Ref DynamicFewShotVectorBucket + + DynamicFewShotDatasetBucket: + Type: AWS::S3::Bucket + DeletionPolicy: RetainExceptOnCreate + Metadata: + cfn_nag: + rules_to_suppress: + - id: W84 + reason: "Demo function - KMS CMK not required, but can be added by customer for production use cases" + # checkov:skip=CKV_AWS_158: "Demo function - KMS CMK not required, but can be added by customer for production use cases" + Properties: + BucketEncryption: + ServerSideEncryptionConfiguration: + - ServerSideEncryptionByDefault: + SSEAlgorithm: "AES256" + PublicAccessBlockConfiguration: + BlockPublicAcls: true + BlockPublicPolicy: true + IgnorePublicAcls: true + RestrictPublicBuckets: true + VersioningConfiguration: + Status: Enabled + +Outputs: + + DynamicFewShotFunctionName: + Description: Name of the demo Lambda function + Value: !Ref DynamicFewShotFunction + + DynamicFewShotFunctionArn: + Description: ARN of the demo Lambda function (use this in your GenAIIDP configuration) + Value: !GetAtt DynamicFewShotFunction.Arn + + DynamicFewShotLogGroup: + Description: CloudWatch Log Group for monitoring demo Lambda execution + Value: !Ref DynamicFewShotLogGroup + + DynamicFewShotVectorBucketArn: + Description: S3 Vectors bucket for dynamic few-shot examples + Value: !Ref DynamicFewShotVectorBucket + + DynamicFewShotVectorIndexArn: + Description: S3 Vectors index for dynamic few-shot examples + Value: !Ref DynamicFewShotVectorIndex + + DynamicFewShotDatasetBucket: + Description: S3 Bucket for example data sets + Value: !Ref DynamicFewShotDatasetBucket + + UsageInstructions: + Description: How to use this Lambda in your IDP configuration + Value: !Sub | + Add this ARN to your extraction config: + extraction: + dynamic_few_shot_lambda_arn: "${DynamicFewShotFunction.Arn}" + + MonitoringLink: + Description: Direct link to CloudWatch logs for this function + Value: !Sub | + https://console.aws.amazon.com/cloudwatch/home?region=${AWS::Region}#logsV2:log-groups/log-group/$252Faws$252Flambda$252F${LambdaFunctionName} \ No newline at end of file From 75eb394632bf5bed11fea7399945f9ee0174ff62 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Thu, 11 Dec 2025 14:43:28 +0100 Subject: [PATCH 03/39] chore: remove whitespace --- .../GENAIIDP-dynamic-few-shot.py | 4 ++-- .../examples/dynamic-few-shot-lambda/README.md | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py index b2c6272d..94d9f746 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py +++ b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py @@ -194,7 +194,7 @@ def _extract_metadata(metadata, distance): image_data = _get_image_data_from_s3_path(image_path) encoded_images = _encode_images(image_data) attributes["image_content"] = encoded_images - + return attributes def _get_image_data_from_s3_path(image_path): @@ -227,7 +227,7 @@ def _get_image_data_from_s3_path(image_path): except Exception as e: 
logger.warning(f"Failed to load image {image_file_path}: {e}")
             continue
-    
+
     return image_content
 
 def _get_image_files_from_s3_path(image_path):
diff --git a/notebooks/examples/dynamic-few-shot-lambda/README.md b/notebooks/examples/dynamic-few-shot-lambda/README.md
index a6f4219e..d30ba33d 100644
--- a/notebooks/examples/dynamic-few-shot-lambda/README.md
+++ b/notebooks/examples/dynamic-few-shot-lambda/README.md
@@ -26,7 +26,7 @@ flowchart TD
     A[Document Processing] --> B{Dynamic few-shot configured?}
     B -->|No| C[Use Default Extraction]
     B -->|Yes| D[Invoke Dynamic few-shot Lambda]
-    
+
     subgraph Lambda
         D --> E[Receive Document Images]
         E --> F[Generate Embeddings with Nova]
@@ -35,27 +35,27 @@ flowchart TD
         H --> I[Load Example Images from S3]
         I --> J[Format Examples for Bedrock]
     end
-    
+
     J --> K[Use Examples in Extraction Prompt]
     C --> L[Continue with Standard Extraction]
     K --> L
-    
+
     subgraph Input
        M[Document Class]
        N[Document Text]
        O[Document Images]
     end
-    
+
     subgraph Output
        P[Example Attributes Prompts]
        Q[Example Images]
        R[Similarity Distances]
     end
-    
+
     D -.-> M
     D -.-> N
     D -.-> O
-    
+
     J -.-> P
     J -.-> Q
     J -.-> R
@@ -163,7 +163,7 @@ def merge_examples(combined_examples, new_examples):
         if combined_examples.get(key):
             # Keep the better (lower) distance score
             combined_examples[key]["distance"] = min(
-                new_example.get("distance"), 
+                new_example.get("distance"),
                 combined_examples[key]["distance"]
             )
From fbe11b2712b9adc9117700f981abf3a5bc412c35 Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Fri, 28 Nov 2025 21:40:32 +0100
Subject: [PATCH 04/39] feat: add support for Amazon Titan Multimodal Embeddings G1 and Amazon Nova Multimodal Embeddings

---
 .../idp_common/bedrock/README.md              | 41 ++++++++++
 .../idp_common/bedrock/client.py              | 78 +++++++++++++++++--
 2 files changed, 112 insertions(+), 7 deletions(-)

diff --git a/lib/idp_common_pkg/idp_common/bedrock/README.md b/lib/idp_common_pkg/idp_common/bedrock/README.md
index 58c5bd64..b0a67cf7 100644
--- a/lib/idp_common_pkg/idp_common/bedrock/README.md
+++ b/lib/idp_common_pkg/idp_common/bedrock/README.md
@@ -73,6 +73,47 @@ embedding = client.generate_embedding(
 # Use embedding for vector search, clustering, etc.
 ```
 
+Amazon Titan Multimodal Embeddings support both text and image at the same time. The resulting embeddings vector averages the text embeddings and image embeddings vectors.
+
+```python
+from idp_common.bedrock.client import BedrockClient
+
+with open("/path/to/document.png", "rb") as image_file:
+    image_data = image_file.read()
+
+client = BedrockClient()
+embedding = client.generate_embedding(
+    text="This document contains information about loan applications.",
+    image_source=image_data,
+    model_id="amazon.titan-embed-image-v1"
+)
+```
+
+The image source can also be an S3 URI:
+
+```python
+from idp_common.bedrock.client import BedrockClient
+
+client = BedrockClient()
+embedding = client.generate_embedding(
+    image_source="s3://bucket/key",
+    model_id="amazon.titan-embed-image-v1"
+)
+```
+
+Amazon Nova Multimodal Embeddings with a 3072-dimension output vector:
+
+```python
+from idp_common.bedrock.client import BedrockClient
+
+client = BedrockClient()
+embedding = client.generate_embedding(
+    image_source="s3://bucket/key",
+    model_id="amazon.nova-2-multimodal-embeddings-v1:0",
+    dimensions=3072
+)
+```
+
 ## Prompt Caching with CachePoint
 
 Prompt caching is a powerful feature in Amazon Bedrock that significantly reduces response latency for workloads with repetitive contexts. The Bedrock client provides built-in support for this via the `<<CACHEPOINT>>` tag. 
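A note on the request shape: for reviewers who want to sanity-check the Nova payload that `generate_embedding` assembles in the client.py change below, the equivalent raw Bedrock call looks roughly like the following sketch. Field names are taken from the diff itself; the image filename and format are placeholders, and model availability depends on your account and region.

```python
import base64
import json

import boto3

bedrock_runtime = boto3.client("bedrock-runtime")

# Placeholder image file - any document page works; the real client
# detects the format via prepare_bedrock_image_attachment
with open("page-1.png", "rb") as f:
    image_base64 = base64.b64encode(f.read()).decode("utf-8")

body = {
    "taskType": "SINGLE_EMBEDDING",
    "singleEmbeddingParams": {
        "embeddingPurpose": "GENERIC_INDEX",
        "embeddingDimension": 3072,
        "image": {
            "format": "png",
            "source": {"bytes": image_base64},
        },
    },
}

response = bedrock_runtime.invoke_model(
    modelId="amazon.nova-2-multimodal-embeddings-v1:0",
    body=json.dumps(body),
)
response_body = json.loads(response["body"].read())

# Matches the response parsing added in _generate_embedding_with_retry
embedding = response_body["embeddings"][0]["embedding"]
print(len(embedding))  # expect 3072
```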
diff --git a/lib/idp_common_pkg/idp_common/bedrock/client.py b/lib/idp_common_pkg/idp_common/bedrock/client.py
index 3f19ffe5..42d0df2a 100644
--- a/lib/idp_common_pkg/idp_common/bedrock/client.py
+++ b/lib/idp_common_pkg/idp_common/bedrock/client.py
@@ -16,6 +16,7 @@
 import copy
 import random
 import socket
+import base64
 from typing import Dict, Any, List, Optional, Union, Tuple, Type
 from botocore.config import Config
 from botocore.exceptions import (
@@ -25,7 +26,10 @@
     EndpointConnectionError,
 )
 from urllib3.exceptions import ReadTimeoutError as Urllib3ReadTimeoutError
-
+from idp_common.image import (
+    prepare_image,
+    prepare_bedrock_image_attachment
+)
 
 # Dummy exception classes for requests timeouts if requests is not available
 class _RequestsReadTimeout(Exception):
@@ -711,22 +715,29 @@ def get_guardrail_config(self) -> Optional[Dict[str, str]]:
 
     def generate_embedding(
         self,
-        text: str,
+        text: str = "",
+        image_source: Optional[Union[str, bytes]] = None,
         model_id: str = "amazon.titan-embed-text-v1",
+        dimensions: int = 1024,
         max_retries: Optional[int] = None,
     ) -> List[float]:
         """
-        Generate an embedding vector for the given text using Amazon Bedrock.
+        Generate an embedding vector for the given text or image_source using Amazon Bedrock.
+        At least one of text or the image is required to generate the embedding.
+        For Titan Multimodal embedding models, you can include both to create an embeddings query vector that averages the resulting text embeddings and image embeddings vectors.
+        For Nova Multimodal embedding models, exactly one of text or the image must be present, but not both.
 
         Args:
             text: The text to generate embeddings for
+            image_source: The image to generate embeddings for (can be either an S3 URI (s3://bucket/key) or raw image bytes)
             model_id: The embedding model ID to use (default: amazon.titan-embed-text-v1)
             max_retries: Optional override for the instance's max_retries setting
+            dimensions: Length of the output embeddings vector
 
         Returns:
             List of floats representing the embedding vector
         """
-        if not text or not isinstance(text, str):
+        if (not text or not isinstance(text, str)) and (not image_source):
             # Return an empty vector for empty input
             return []
 
@@ -741,12 +752,61 @@ def generate_embedding(
         # Normalize whitespace and prepare the input text
         normalized_text = " ".join(text.split())
 
+        # Convert image to base64
+        if image_source:
+            image_bytes = prepare_image(image_source)
+            image_base64 = base64.b64encode(image_bytes).decode('utf-8')
+
+        dimensions = int(dimensions)
+
         # Prepare the request body based on the model
-        if "amazon.titan-embed" in model_id:
-            request_body = json.dumps({"inputText": normalized_text})
+        payload_body: Dict[str, Any] = {}
+
+        if "amazon.titan-embed-text" in model_id:
+            if not normalized_text:
+                raise ValueError(
+                    "Amazon Titan Text models require a text parameter to generate embeddings."
+                )
+            payload_body = {
+                "inputText": normalized_text,
+                "dimensions": dimensions,
+            }
+        elif "amazon.titan-embed-image" in model_id:
+            payload_body = {
+                "embeddingConfig": {
+                    "outputEmbeddingLength": dimensions,
+                }
+            }
+            if normalized_text:
+                payload_body["inputText"] = normalized_text
+            if image_source:
+                payload_body["inputImage"] = image_base64
+        elif "amazon.nova-2-multimodal-embeddings" in model_id:
+            if normalized_text and image_source:
+                raise ValueError(
+                    "Amazon Nova Multimodal Embedding models require exactly one of text or image parameter, but not both at the same time."
+ ) + payload_body = { + "taskType": "SINGLE_EMBEDDING", + "singleEmbeddingParams": { + "embeddingPurpose": "GENERIC_INDEX", + "embeddingDimension": dimensions, + } + } + if normalized_text: + payload_body["singleEmbeddingParams"]["text"] = {"truncationMode": "END", "value": normalized_text} + if image_source: + payload_body["singleEmbeddingParams"].update(prepare_bedrock_image_attachment(image_bytes)) # detect image format + payload_body["singleEmbeddingParams"]["image"]["source"]["bytes"] = image_base64 else: # Default format for other models - request_body = json.dumps({"text": normalized_text}) + if not normalized_text: + raise ValueError( + "Default format requires a text parameter to generate embeddings for." + ) + payload_body = {"text": normalized_text} + + request_body = json.dumps(payload_body) # Call the recursive embedding function return self._generate_embedding_with_retry( @@ -805,6 +865,10 @@ def _generate_embedding_with_retry( # Handle different response formats based on the model if "amazon.titan-embed" in model_id: embedding = response_body.get("embedding", []) + elif "amazon.titan-embed-image" in model_id: + embedding = response_body.get("embedding", []) + elif "amazon.nova-2-multimodal-embeddings" in model_id: + embedding = response_body["embeddings"][0]["embedding"] else: # Default extraction format embedding = response_body.get("embedding", []) From 94d33e7e966924ceda88200db5de2d88ea894f3a Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 28 Nov 2025 22:56:01 +0100 Subject: [PATCH 05/39] chore: move idp_common.image import to generate_embedding function, otherwise bedrock client would always require PIL dependency --- lib/idp_common_pkg/idp_common/bedrock/client.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/idp_common_pkg/idp_common/bedrock/client.py b/lib/idp_common_pkg/idp_common/bedrock/client.py index 42d0df2a..9167a4ce 100644 --- a/lib/idp_common_pkg/idp_common/bedrock/client.py +++ b/lib/idp_common_pkg/idp_common/bedrock/client.py @@ -26,10 +26,6 @@ EndpointConnectionError, ) from urllib3.exceptions import ReadTimeoutError as Urllib3ReadTimeoutError -from idp_common.image import ( - prepare_image, - prepare_bedrock_image_attachment -) # Dummy exception classes for requests timeouts if requests is not available class _RequestsReadTimeout(Exception): @@ -737,6 +733,11 @@ def generate_embedding( Returns: List of floats representing the embedding vector """ + from idp_common.image import ( + prepare_image, + prepare_bedrock_image_attachment + ) + if (not text or not isinstance(text, str)) and (not image_source): # Return an empty vector for empty input return [] From 182ec1b8c4a309bb3853f31f852db9f9a9263e35 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 28 Nov 2025 22:59:54 +0100 Subject: [PATCH 06/39] feat: add notebook to ingest FATURA2 dataset into S3 vectors --- notebooks/misc/fewshot_dataset_import.ipynb | 487 ++++++++++++++++++++ 1 file changed, 487 insertions(+) create mode 100644 notebooks/misc/fewshot_dataset_import.ipynb diff --git a/notebooks/misc/fewshot_dataset_import.ipynb b/notebooks/misc/fewshot_dataset_import.ipynb new file mode 100644 index 00000000..baca9464 --- /dev/null +++ b/notebooks/misc/fewshot_dataset_import.ipynb @@ -0,0 +1,487 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Few-shot Dataset Import to S3 Vector store\n", + "\n", + "This notebook demonstrates how to import the FATURA2 dataset into S3 Vectors for use with the examples-provider Lambda 
function.\n", + "\n", + "The FATURA2 dataset contains invoice documents that can be used as few-shot examples for document extraction tasks.\n", + "\n", + "## Process Overview:\n", + "\n", + "1. **Load FATURA2 Dataset** - Download and process the dataset\n", + "2. **Generate Embeddings** - Create multimodal embeddings using Amazon Nova\n", + "3. **Upload to S3 Vectors** - Store embeddings and metadata in S3 Vectors index\n", + "4. **Verify Import** - Test similarity search functionality\n", + "\n", + "> **Note**: This notebook requires AWS credentials with permissions for Bedrock, S3, and S3 Vectors services." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's make sure that modules are autoreloaded\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "ROOTDIR=\"../..\"\n", + "# First uninstall existing package (to ensure we get the latest version)\n", + "%pip uninstall -y idp_common\n", + "\n", + "# Install the IDP common package with all components in development mode\n", + "%pip install -q -e \"{ROOTDIR}/lib/idp_common_pkg[dev, all]\"\n", + "\n", + "# Note: We can also install specific components like:\n", + "# %pip install -q -e \"{ROOTDIR}/lib/idp_common_pkg[ocr,classification,extraction,evaluation]\"\n", + "\n", + "# Check installed version\n", + "%pip show idp_common | grep -E \"Version|Location\"\n", + "\n", + "# Install required packages\n", + "%pip install -q pillow requests tqdm pandas\n", + "\n", + "# Optionally use a .env file for environment variables\n", + "try:\n", + " from dotenv import load_dotenv\n", + " load_dotenv() \n", + "except ImportError:\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Import Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import zipfile\n", + "import requests\n", + "from pathlib import Path\n", + "from typing import Dict, List, Any\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "\n", + "import boto3\n", + "from PIL import Image\n", + "\n", + "# Import IDP common modules\n", + "from idp_common import bedrock\n", + "\n", + "print(\"Libraries imported successfully\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Configure S3 Vectors and Bedrock" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration - Update these values based on your deployment of the 'notebooks/examples/dynamic-few-shot-lambda' stack\n", + "S3_BUCKET_FOR_IMAGES = \"genaiidp-dynamic-few-shot-dynamicfewshotdatasetbuc-nuz4jeue5hds\" # Stack output 'DynamicFewShotDatasetBucket'\n", + "S3_VECTORS_BUCKET = \"genaiidp-dynamic-few-shot\"\n", + "S3_VECTORS_INDEX = \"documents\"\n", + "\n", + "EMBEDDING_MODEL_ID = \"amazon.nova-2-multimodal-embeddings-v1:0\"\n", + "EMBEDDING_DIMENSIONS = 3072\n", + "\n", + "# Initialize clients\n", + "s3vectors_client = boto3.client('s3vectors')\n", + "s3_client = boto3.client('s3')\n", + "bedrock_client = bedrock.BedrockClient()\n", + "\n", + "print(f\"Configured for S3 Vectors bucket: {S3_VECTORS_BUCKET}\")\n", + "print(f\"Configured for S3 Vectors index: {S3_VECTORS_INDEX}\")\n", + "print(f\"Using embedding model: {EMBEDDING_MODEL_ID}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Load FATURA2 Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download and extract FATURA2 dataset from Zenodo\n", + "print(\"Downloading FATURA2 dataset...\")\n", + "\n", + "# Configuration for this dataset\n", + "IMAGE_VARIANT = 'colored_images'\n", + "ANNOTATION_VARIANT = 'Original_Format'\n", + "CLASS_LABEL = 'invoice'\n", + "\n", + "# Create datasets directory\n", + "datasets_dir = Path('datasets')\n", + "datasets_dir.mkdir(exist_ok=True)\n", + "\n", + "# Download the zip file\n", + "zip_url = 'https://zenodo.org/records/10371464/files/FATURA2.zip?download=1'\n", + "zip_path = datasets_dir / 'FATURA2.zip'\n", + "\n", + "if not zip_path.exists():\n", + " response = requests.get(zip_url, stream=True)\n", + " response.raise_for_status()\n", + " \n", + " with open(zip_path, 'wb') as f:\n", + " for chunk in tqdm(response.iter_content(chunk_size=8192), desc='Downloading'):\n", + " f.write(chunk)\n", + " print(f\"Downloaded {zip_path}\")\n", + "else:\n", + " print(f\"Using existing {zip_path}\")\n", + "\n", + "# Extract the zip file\n", + "extract_dir = datasets_dir / 'invoices_dataset_final'\n", + "if not extract_dir.exists():\n", + " with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n", + " zip_ref.extractall(datasets_dir)\n", + " print(f\"Extracted to {extract_dir}\")\n", + "else:\n", + " print(f\"Using existing {extract_dir}\")\n", + "\n", + "colored_images = extract_dir / IMAGE_VARIANT\n", + "\n", + "# Load images from extracted directory\n", + "image_files = list(colored_images.glob('**/*.jpg'))\n", + "print(f\"Found {len(image_files)} {IMAGE_VARIANT} files\")\n", + "\n", + "# Show sample\n", + "if image_files:\n", + " sample_image = Image.open(image_files[0])\n", + " print(f\"Sample image: {image_files[0].name}\")\n", + " print(f\"Image size: {sample_image.size}\")\n", + "\n", + "print(f\"Image variant: {IMAGE_VARIANT}\")\n", + "print(f\"Annotation variant: {ANNOTATION_VARIANT}\")\n", + "print(f\"Class label: {CLASS_LABEL}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Process Dataset and Generate Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "def upload_image_to_s3(image_bytes: bytes, s3_key: str) -> str:\n", + " \"\"\"Upload image to S3 and return S3 URI.\"\"\"\n", + " s3_client.put_object(\n", + " Bucket=S3_BUCKET_FOR_IMAGES,\n", + " Key=s3_key,\n", + " Body=image_bytes,\n", + " ContentType='image/jpeg'\n", + " )\n", + " return f\"s3://{S3_BUCKET_FOR_IMAGES}/{s3_key}\"\n", + "\n", + "def load_split(extract_dir, split_name):\n", + " csv_path = extract_dir / (split_name + \".csv\")\n", + " return pd.read_csv(csv_path)\n", + "\n", + "def read_annotation(extract_dir, annot_path):\n", + " json_path = extract_dir / \"Annotations\" / ANNOTATION_VARIANT / annot_path\n", + " with open(json_path, \"r\") as f:\n", + " annotation = f.read()\n", + " return json.loads(annotation)\n", + "\n", + "def load_image(extract_dir, img_path):\n", + " image_path = extract_dir / IMAGE_VARIANT / img_path\n", + " with open(image_path, \"rb\") as f:\n", + " image_content = f.read()\n", + " return image_content\n", + "\n", + "def map_labels(annotations):\n", + " labels = {}\n", + " labels['invoice_number'] = annotations.get(\"NUMBER\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['invoice_date'] = annotations.get(\"DATE\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['due_date'] = annotations.get(\"DUE_DATE\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['vendor_name'] = annotations.get(\"SELLER_NAME\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['vendor_address'] = annotations.get(\"SELLER_ADDRESS\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " BUYER = annotations.get(\"BUYER\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['customer_name'] = BUYER[0] if len(BUYER) > 0 else []\n", + " labels['customer_address'] = BUYER[1:] if len(BUYER) > 1 else []\n", + " labels['items'] = \"null\"\n", + " labels['quantities'] = \"null\"\n", + " labels['unit_prices'] = \"null\"\n", + " labels['subtotal'] = annotations.get(\"SUB_TOTAL\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['tax'] = annotations.get(\"TAX\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['total_amount'] = annotations.get(\"TOTAL\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['payment_terms'] = annotations.get(\"NOTE\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " labels['po_number'] = annotations.get(\"GSTIN_BUYER\", {}).get(\"text\", \"null\").split(\"\\n\")\n", + " return labels\n", + "\n", + "def get_attributes_prompt(labels):\n", + " attributes_prompt = f\"\"\"expected attributes are:\n", + " \"invoice_number\": {\", \".join(labels['invoice_number'])}\n", + " \"invoice_date\": {\", \".join(labels['invoice_date'])}\n", + " \"due_date\": {\", \".join(labels['due_date'])}\n", + " \"vendor_name\": {\", \".join(labels['vendor_name'])}\n", + " \"vendor_address\": {\", \".join(labels['vendor_address'])}\n", + " \"customer_name\": {labels['customer_name']}\n", + " \"customer_address\": {\", \".join(labels['customer_address'])}\n", + " \"items\": {labels['items']}\n", + " \"quantities\": {labels['quantities']}\n", + " \"unit_prices\": {labels['unit_prices']}\n", + " \"subtotal\": {\", \".join(labels['subtotal'])}\n", + " \"tax\": {\", \".join(labels['tax'])}\n", + " \"total_amount\": {\", \".join(labels['total_amount'])}\n", + " \"payment_terms\": {\", \".join(labels['payment_terms'])}\n", + " \"po_number\": {\", 
\".join(labels['po_number'])}\n", + " \"\"\".strip()\n", + " return attributes_prompt\n", + "\n", + "def create_metadata(annotations: Dict, s3_image_uri: str) -> Dict:\n", + " \"\"\"Create metadata for S3 Vectors entry.\"\"\"\n", + " class_prompt = f\"This is an example of the class '{CLASS_LABEL}'\"\n", + "\n", + " labels = map_labels(annotations)\n", + " attributes_prompt = get_attributes_prompt(labels)\n", + "\n", + " return {\n", + " \"classLabel\": CLASS_LABEL,\n", + " \"classPrompt\": class_prompt,\n", + " \"attributesPrompt\": attributes_prompt,\n", + " \"imagePath\": s3_image_uri,\n", + " }\n", + "\n", + "print(\"Helper functions defined\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Import Dataset to S3 Vectors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Process a subset of the dataset (adjust as needed)\n", + "MAX_SAMPLES = 100 # Adjust this number based on your needs\n", + "BATCH_SIZE = 10 # Adjust this number based on your needs\n", + "\n", + "dataset_split = load_split(extract_dir, \"strat1_train\")\n", + "samples_to_process = min(MAX_SAMPLES, len(dataset_split))\n", + "\n", + "print(f\"Processing {samples_to_process} samples from FATURA2 dataset...\")\n", + "\n", + "vectors_to_upload = []\n", + "failed_samples = []\n", + "\n", + "for i in tqdm(range(samples_to_process), desc=\"Processing samples\"):\n", + " try:\n", + " df_image = dataset_split.iloc[i]\n", + "\n", + " # Load annotations\n", + " annotations = read_annotation(extract_dir, df_image[\"annot_path\"])\n", + " \n", + " # Load image\n", + " image_bytes = load_image(extract_dir, df_image[\"img_path\"])\n", + "\n", + " # Upload image to S3\n", + " s3_key = f\"fatura2/{IMAGE_VARIANT}/{df_image['img_path']}\"\n", + " s3_image_uri = upload_image_to_s3(image_bytes, s3_key)\n", + " \n", + " # Generate embedding\n", + " embedding = bedrock_client.generate_embedding(\n", + " image_source=image_bytes,\n", + " model_id=EMBEDDING_MODEL_ID,\n", + " dimensions=EMBEDDING_DIMENSIONS\n", + " )\n", + " \n", + " # Create metadata\n", + " metadata = create_metadata(annotations, s3_image_uri)\n", + "\n", + " # Prepare vector for upload\n", + " vector_entry = {\n", + " \"key\": f\"fatura2_sample_{i:06d}\",\n", + " \"data\": {\"float32\": embedding},\n", + " \"metadata\": metadata\n", + " }\n", + "\n", + " vectors_to_upload.append(vector_entry)\n", + " \n", + " # Upload in batches to avoid memory issues\n", + " if len(vectors_to_upload) >= BATCH_SIZE: # Batch size\n", + " print(f\"\\nUploading batch of {len(vectors_to_upload)} vectors...\")\n", + " response = s3vectors_client.put_vectors(\n", + " vectorBucketName=S3_VECTORS_BUCKET,\n", + " indexName=S3_VECTORS_INDEX,\n", + " vectors=vectors_to_upload\n", + " )\n", + " print(f\"Batch upload response: {response.get('ResponseMetadata', {}).get('HTTPStatusCode')}\")\n", + " vectors_to_upload = [] # Clear batch\n", + " \n", + " except Exception as e:\n", + " print(f\"\\nFailed to process sample {i}: {e}\")\n", + " failed_samples.append(i)\n", + " continue\n", + "\n", + "# Upload remaining vectors\n", + "if vectors_to_upload:\n", + " print(f\"\\nUploading final batch of {len(vectors_to_upload)} vectors...\")\n", + " response = s3vectors_client.put_vectors(\n", + " vectorBucketName=S3_VECTORS_BUCKET,\n", + " indexName=S3_VECTORS_INDEX,\n", + " vectors=vectors_to_upload\n", + " )\n", + " print(f\"Final batch upload response: {response.get('ResponseMetadata', 
{}).get('HTTPStatusCode')}\")\n",
    "\n",
    "print(f\"\\nImport completed!\")\n",
    "print(f\"Successfully processed: {samples_to_process - len(failed_samples)} samples\")\n",
    "print(f\"Failed samples: {len(failed_samples)}\")\n",
    "if failed_samples:\n",
    "    print(f\"Failed sample indices: {failed_samples[:10]}...\") # Show first 10"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Verify Import with Similarity Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test similarity search with a sample from the dataset\n",
    "test_split = load_split(extract_dir, \"strat1_test\")\n",
    "\n",
    "test_sample_index = 0\n",
    "df_image = test_split.iloc[test_sample_index]\n",
    "\n",
    "test_image_bytes = load_image(extract_dir, df_image[\"img_path\"])\n",
    "\n",
    "print(f\"Testing similarity search with sample {extract_dir / IMAGE_VARIANT / df_image['img_path']}...\")\n",
    "\n",
    "# Generate embedding for test image\n",
    "test_embedding = bedrock_client.generate_embedding(\n",
    "    image_source=test_image_bytes,\n",
    "    model_id=EMBEDDING_MODEL_ID,\n",
    "    dimensions=EMBEDDING_DIMENSIONS\n",
    ")\n",
    "\n",
    "# Query S3 Vectors for similar examples\n",
    "response = s3vectors_client.query_vectors(\n",
    "    vectorBucketName=S3_VECTORS_BUCKET,\n",
    "    indexName=S3_VECTORS_INDEX,\n",
    "    queryVector={\"float32\": test_embedding},\n",
    "    topK=5,\n",
    "    returnDistance=True,\n",
    "    returnMetadata=True\n",
    ")\n",
    "\n",
    "print(f\"\\nFound {len(response['vectors'])} similar examples:\")\n",
    "for i, vector in enumerate(response['vectors']):\n",
    "    distance = vector.get('distance', 'N/A')\n",
    "    key = vector.get('key', 'N/A')\n",
    "    metadata = vector.get('metadata', {})\n",
    "    class_label = metadata.get('classLabel', 'N/A')\n",
    "    class_prompt = metadata.get('classPrompt', 'N/A')\n",
    "    attributes_prompt = metadata.get('attributesPrompt', 'N/A')\n",
    "    image_path = metadata.get('imagePath', 'N/A')\n",
    "    \n",
    "    print(f\"  {i+1}. Key: {key}\")\n",
    "    print(f\"     Distance: {distance:.4f}\")\n",
    "    print(f\"     Class Label: {class_label}\")\n",
    "    print(f\"     Class Prompt: {class_prompt}\")\n",
    "    print(f\"     Attributes Prompt: {attributes_prompt}\")\n",
    "    print(f\"     Image Path: {image_path}\")\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Summary and Next Steps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"=== Few-shot Dataset Import Summary ===\")\n",
    "print(f\"✅ Dataset: FATURA2 (Invoice documents)\")\n",
    "print(f\"✅ Samples processed: {samples_to_process - len(failed_samples)}\")\n",
    "print(f\"✅ S3 Vectors Bucket: {S3_VECTORS_BUCKET}\")\n",
    "print(f\"✅ S3 Vectors Index: {S3_VECTORS_INDEX}\")\n",
    "print(f\"✅ Images stored in: s3://{S3_BUCKET_FOR_IMAGES}/fatura2/{IMAGE_VARIANT}/\")\n",
    "print(f\"✅ Embedding Model: {EMBEDDING_MODEL_ID}\")\n",
    "print(f\"✅ Similarity search verified\")\n",
    "\n",
    "print(\"\\n=== Next Steps ===\")\n",
    "print(\"1. Upload your own datasets into S3 Vectors\")\n",
    "print(\"2. Configure your IDP extraction to use the dynamic few-shot Lambda ARN\")\n",
    "print(\"3. 
Test document processing with few-shot examples!\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 1e7cac3df80c6cdfe3c1595f175511e1691910cb Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Mon, 1 Dec 2025 21:06:04 +0100 Subject: [PATCH 07/39] chore: update input parameter for document_text + fixes --- .../GENAIIDP-dynamic-few-shot.py | 21 ++++++++++--------- .../dynamic-few-shot-lambda/README.md | 5 +---- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py index 94d9f746..61b69295 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py +++ b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py @@ -21,7 +21,8 @@ from idp_common import bedrock, s3 logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) +level = logging.getLevelName(os.environ.get("LOG_LEVEL", "INFO")) +logger.setLevel(level) # Parse environment variables with error handling try: @@ -45,7 +46,7 @@ def lambda_handler(event, context): Input event: { "class_label": "", - "document_texts": ["", "", ...], + "document_text": "", "image_content": ["", "", ...] } @@ -66,13 +67,13 @@ def lambda_handler(event, context): # Validate input class_label = event.get("class_label") - document_texts = event.get("document_texts", []) + document_text = event.get("document_text") image_content = event.get("image_content", []) logger.info(f"=== INPUT VALUES ===") logger.info(f"Class label: {class_label if class_label else 'Not specified'}") - logger.info(f"Document texts: {len(document_texts)}") - logger.info(f"Image content: {len(image_content)}") + logger.info(f"Document text: {len(document_text) if document_text else "0"} bytes") + logger.info(f"Image content: {len(image_content)} images") # Decode input data image_data = _decode_images(image_content) @@ -114,17 +115,17 @@ def _encode_images(image_content): def _s3vectors_find_similar_items(image_data): """Find similar items for input""" - # find similar items based on image similarity only similar_items = {} for page_image in image_data: - result = _s3vectors_find_similar_items_from_image(image_data) + result = _s3vectors_find_similar_items_from_image(page_image) _merge_examples(similar_items, result) # create result set result = [] for key, example in similar_items.items(): metadata = example.get("metadata", {}) + distance = example.get("distance") attributes_prompt = metadata.get("attributesPrompt") # Only process this example if it has a non-empty attributesPrompt @@ -134,7 +135,7 @@ def _s3vectors_find_similar_items(image_data): ) continue - attributes = _extract_metadata(metadata) + attributes = _extract_metadata(metadata, distance) result.append(attributes) return result @@ -169,8 +170,8 @@ def _merge_examples(examples, new_examples): new_distance = new_example.get("distance", 1.0) # update example - if combined_examples.get(key): - existing_distance = combined_examples[key].get("distance", 1.0) + if examples.get(key): + existing_distance = examples[key].get("distance", 1.0) 
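+        # keep the lower (more similar) distance when the same key is returned for multiple pages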
examples[key]["distance"] = min(new_distance, existing_distance) examples[key]["metadata"] = new_example.get("metadata") # insert example diff --git a/notebooks/examples/dynamic-few-shot-lambda/README.md b/notebooks/examples/dynamic-few-shot-lambda/README.md index d30ba33d..e30f913e 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/README.md +++ b/notebooks/examples/dynamic-few-shot-lambda/README.md @@ -103,10 +103,7 @@ extraction: ```json { "class_label": "invoice", - "document_texts": [ - "Invoice text or markdown from page 1...", - "Invoice text or markdown from page 2..." - ], + "document_text": "Text or markdown from section 1 (pages 1-3)...", "image_content": [ "base64_encoded_image_1", "base64_encoded_image_2" From 99a3605aaeebe61c07d87851b87a6b6c9b59a572 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Mon, 1 Dec 2025 21:09:19 +0100 Subject: [PATCH 08/39] feat: add notebook for dynamic few-shot Lambda testing --- .../config/extraction_with_few_shot.yaml | 101 +++++ ...ep3_extraction_with_dynamic_few_shot.ipynb | 420 ++++++++++++++++++ 2 files changed, 521 insertions(+) create mode 100644 notebooks/examples/config/extraction_with_few_shot.yaml create mode 100644 notebooks/examples/step3_extraction_with_dynamic_few_shot.ipynb diff --git a/notebooks/examples/config/extraction_with_few_shot.yaml b/notebooks/examples/config/extraction_with_few_shot.yaml new file mode 100644 index 00000000..addd9a01 --- /dev/null +++ b/notebooks/examples/config/extraction_with_few_shot.yaml @@ -0,0 +1,101 @@ +# Extraction Service Configuration +extraction: + top_p: '0.1' + max_tokens: '4096' + top_k: '5' + temperature: '0.0' + model: us.amazon.nova-pro-v1:0 + system_prompt: >- + You are a document assistant. Respond only with JSON. Never make up data, only provide data found in the document being provided. + task_prompt: >- + + + You are an expert in document analysis and information extraction. + You can understand and extract key information from documents classified as type + + {DOCUMENT_CLASS}. + + + + + + + Your task is to take the unstructured text provided and convert it into a well-organized table format using JSON. Identify the main entities, attributes, or categories mentioned in the attributes list below and use them as keys in the JSON object. + Then, extract the relevant information from the text and populate the corresponding values in the JSON object. + + + + + + + Guidelines: + 1. Ensure that the data is accurately represented and properly formatted within + the JSON structure + 2. Include double quotes around all keys and values + 3. Do not make up data - only extract information explicitly found in the + document + 4. Do not use /n for new lines, use a space instead + 5. If a field is not found or if unsure, return null + 6. All dates should be in MM/DD/YYYY format + 7. Do not perform calculations or summations unless totals are explicitly given + 8. If an alias is not found in the document, return null + 9. Guidelines for checkboxes: + 9.A. CAREFULLY examine each checkbox, radio button, and selection field: + - Look for marks like โœ“, โœ—, x, filled circles (โ—), darkened areas, or handwritten checks indicating selection + - For checkboxes and multi-select fields, ONLY INCLUDE options that show clear visual evidence of selection + - DO NOT list options that have no visible selection mark + 9.B. 
For ambiguous or overlapping tick marks:
+    - If a mark overlaps between two or more checkboxes, determine which option contains the majority of the mark
+    - Consider a checkbox selected if the mark is primarily inside the check box or over the option text
+    - When a mark touches multiple options, analyze which option was most likely intended based on position and density. For handwritten checks, the mark typically flows from the selected checkbox outward.
+    - Carefully analyze visual cues and contextual hints. Think from a human perspective, anticipate natural tendencies, and apply thoughtful reasoning to make the best possible judgment.
+    10. Think step by step first and then answer.
+
+
+
+    If the attributes section below contains a list of attribute names and
+    descriptions, then output only those attributes, using the provided
+    descriptions as guidance for finding the correct values.
+
+
+
+    {ATTRIBUTE_NAMES_AND_DESCRIPTIONS}
+
+
+
+
+
+    {FEW_SHOT_EXAMPLES}
+
+
+
+    <>
+
+
+
+
+    {DOCUMENT_TEXT}
+
+
+
+
+
+
+    {DOCUMENT_IMAGE}
+
+
+
+
+
+
+    Extract key information from the document and return a JSON object with the following key steps:
+    1. Carefully analyze the document text to identify the requested attributes
+    2. Extract only information explicitly found in the document - never make up data
+    3. Format all dates as MM/DD/YYYY and replace newlines with spaces
+    4. For checkboxes, only include options with clear visual selection marks
+    5. Use null for any fields not found in the document
+    6. Ensure the output is properly formatted JSON with quoted keys and values
+    7. Think step by step before finalizing your answer
+
+
diff --git a/notebooks/examples/step3_extraction_with_dynamic_few_shot.ipynb b/notebooks/examples/step3_extraction_with_dynamic_few_shot.ipynb
new file mode 100644
index 00000000..5d5a0663
--- /dev/null
+++ b/notebooks/examples/step3_extraction_with_dynamic_few_shot.ipynb
@@ -0,0 +1,420 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step 3: Dynamic Few-Shot Information Extraction\n",
+    "\n",
+    "This notebook demonstrates the **dynamic few-shot prompting feature** for Pattern 2. It shows how to:\n",
+    "\n",
+    "- Configure a dynamic few-shot Lambda function for extraction\n",
+    "- Compare default vs examples-enhanced extraction results\n",
+    "- Inspect Lambda payloads and responses\n",
+    "- Handle errors and monitor performance\n",
+    "\n",
+    "**Prerequisites:**\n",
+    "- Completed Step 2 (Classification)\n",
+    "- AWS Lambda permissions to create/invoke functions\n",
+    "- Dynamic few-shot Lambda function deployed\n",
+    "- S3 Vectors index populated with examples (`notebooks/misc/fewshot_dataset_import.ipynb`)\n",
+    "\n",
+    "**Key Feature:**\n",
+    "The `dynamic_few_shot_lambda_arn` configuration field allows you to dynamically retrieve similar examples using S3 Vectors similarity search to improve extraction accuracy through few-shot prompting."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. 
Setup and Import Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import time\n", + "import logging\n", + "import boto3\n", + "from pathlib import Path\n", + "import yaml\n", + "\n", + "# Import IDP libraries\n", + "from idp_common.models import Document, Status\n", + "from idp_common.s3 import get_json_content\n", + "from idp_common import extraction\n", + "\n", + "# Configure logging to see Lambda invocation details\n", + "logging.basicConfig(level=logging.INFO)\n", + "logging.getLogger('idp_common.extraction').setLevel(logging.INFO)\n", + "logging.getLogger('idp_common.bedrock.client').setLevel(logging.INFO)\n", + "\n", + "print(\"Libraries imported successfully\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Load Previous Step Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load document from previous step\n", + "classification_data_dir = Path(\".data/step2_classification\")\n", + "\n", + "# Load document object from JSON\n", + "document_path = classification_data_dir / \"document.json\"\n", + "with open(document_path, 'r') as f:\n", + " document = Document.from_json(f.read())\n", + "\n", + "# Load configuration directly from config files\n", + "config_dir = Path(\"config\")\n", + "CONFIG = {}\n", + "\n", + "# Load each configuration file\n", + "config_files = [\n", + " \"extraction_with_few_shot.yaml\",\n", + " \"classes.yaml\"\n", + "]\n", + "\n", + "for config_file in config_files:\n", + " config_path = config_dir / config_file\n", + " if config_path.exists():\n", + " with open(config_path, 'r') as f:\n", + " file_config = yaml.safe_load(f)\n", + " CONFIG.update(file_config)\n", + " print(f\"Loaded {config_file}\")\n", + " else:\n", + " print(f\"Warning: {config_file} not found\")\n", + "\n", + "# Load environment info\n", + "env_path = classification_data_dir / \"environment.json\"\n", + "with open(env_path, 'r') as f:\n", + " env_info = json.load(f)\n", + "\n", + "# Set environment variables\n", + "os.environ['AWS_REGION'] = env_info['region']\n", + "os.environ['METRIC_NAMESPACE'] = 'IDP-Dynamic-Few-Shot'\n", + "\n", + "print(f\"Loaded document: {document.id}\")\n", + "print(f\"Document status: {document.status.value}\")\n", + "print(f\"Number of sections: {len(document.sections) if document.sections else 0}\")\n", + "print(f\"Loaded configuration sections: {list(CONFIG.keys())}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Configure Dynamic Few-Shot Lambda ARN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ๐Ÿ”ง CONFIGURATION: Set your dynamic few-shot Lambda ARN here\n", + "# Replace with your actual Lambda function ARN for live testing\n", + "\n", + "# Check if dynamic few-shot Lambda function exists\n", + "lambda_client = boto3.client('lambda')\n", + "DYNAMIC_FEW_SHOT_LAMBDA_ARN = None\n", + "\n", + "try:\n", + " response = lambda_client.get_function(FunctionName='GENAIIDP-dynamic-few-shot')\n", + " DYNAMIC_FEW_SHOT_LAMBDA_ARN = response['Configuration']['FunctionArn']\n", + " print(f\"โœ… Found dynamic few-shot Lambda function: {DYNAMIC_FEW_SHOT_LAMBDA_ARN}\")\n", + "except lambda_client.exceptions.ResourceNotFoundException:\n", + " print(\"โš ๏ธ Dynamic Few-Shot Lambda function not found: GENAIIDP-dynamic-few-shot\")\n", + " print(\"๐Ÿ’ก Deploy using: cd notebooks/examples/dynamic-few-shot-lambda && sam deploy --guided\")\n", + "except Exception as e:\n", + " print(f\"Error checking Lambda function: {e}\")\n", + "\n", + "if not DYNAMIC_FEW_SHOT_LAMBDA_ARN:\n", + " print(\"โš ๏ธ No dynamic few-shot Lambda ARN configured\")\n", + " print(\"๐Ÿ’ก This demo will show standard extraction without few-shot examples\")\n", + " print(\"๐Ÿ”ง To test with examples, deploy the dynamic few-shot Lambda first\")\n", + "else:\n", + " print(f\"โœ… Dynamic few-shot Lambda ARN configured: {DYNAMIC_FEW_SHOT_LAMBDA_ARN}\")\n", + " print(\"๐Ÿš€ This demo will use few-shot examples from S3 Vectors\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Extraction Comparison: Default vs Dynamic Few-Shot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.1 Default Extraction (Without Dynamic Few-Shot)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create configuration WITHOUT dynamic few-shot Lambda\n", + "config_default = CONFIG.copy()\n", + "if 'dynamic_few_shot_lambda_arn' in config_default.get('extraction', {}):\n", + " del config_default['extraction']['dynamic_few_shot_lambda_arn']\n", + "\n", + "print(\"=== DEFAULT EXTRACTION CONFIGURATION ===\")\n", + "print(f\"Model: {config_default.get('extraction', {}).get('model')}\")\n", + "print(f\"Dynamic Few-Shot Lambda: {config_default.get('extraction', {}).get('dynamic_few_shot_lambda_arn', 'None')}\")\n", + "\n", + "# Create extraction service with default config\n", + "extraction_service_default = extraction.ExtractionService(config=config_default)\n", + "print(\"\\nโœ… Default extraction service initialized\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run default extraction on first section\n", + "if document.sections:\n", + " first_section = document.sections[0]\n", + " print(f\"๐Ÿ”„ Processing section {first_section.section_id} with DEFAULT prompts\")\n", + " print(f\"Classification: {first_section.classification}\")\n", + " print(f\"Pages: {first_section.page_ids}\")\n", + " \n", + " # Save original document state\n", + " document_default = Document.from_json(document.to_json())\n", + " \n", + " # Process with default extraction\n", + " start_time = time.time()\n", + " document_default = extraction_service_default.process_document_section(\n", + " document=document_default,\n", + " section_id=first_section.section_id\n", + " )\n", + " default_extraction_time = time.time() - start_time\n", + " \n", + 
" print(f\"โœ… Default extraction completed in {default_extraction_time:.2f} seconds\")\n", + "\n", + " # Store results for comparison\n", + " default_section_result = None\n", + " for section in document_default.sections:\n", + " if section.section_id == first_section.section_id:\n", + " default_section_result = section\n", + " break\n", + " \n", + "else:\n", + " print(\"โš ๏ธ No sections found in document\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show section extraction result\n", + "if default_section_result:\n", + " print(f\"\\nSection {default_section_result.section_id} extraction result:\")\n", + " extraction_result_uri = default_section_result.extraction_result_uri\n", + "\n", + " if extraction_result_uri:\n", + " result = get_json_content(extraction_result_uri)\n", + " result_json = json.dumps(result[\"inference_result\"], indent=2)\n", + " print(result_json)\n", + "\n", + "else:\n", + " print(\"โš ๏ธ No sections found in document\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.2 Dynamic Few-Shot Extraction using Lambda" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if DYNAMIC_FEW_SHOT_LAMBDA_ARN:\n", + " # Create configuration WITH dynamic few-shot Lambda\n", + " config_few_shot = CONFIG.copy()\n", + " config_few_shot['extraction']['dynamic_few_shot_lambda_arn'] = DYNAMIC_FEW_SHOT_LAMBDA_ARN\n", + " \n", + " print(\"=== DYNAMIC FEW-SHOT EXTRACTION CONFIGURATION ===\")\n", + " print(f\"Model: {config_few_shot.get('extraction', {}).get('model')}\")\n", + " print(f\"Dynamic Few-Shot Lambda: {DYNAMIC_FEW_SHOT_LAMBDA_ARN}\")\n", + " print(f\"Lambda Function Name: {DYNAMIC_FEW_SHOT_LAMBDA_ARN.split(':')[-1]}\")\n", + " \n", + " # Create extraction service with dynamic few-shot config\n", + " extraction_service_few_shot = extraction.ExtractionService(config=config_few_shot)\n", + " \n", + " print(\"\\nโœ… Dynamic few-shot extraction service initialized\")\n", + " \n", + "else:\n", + " print(\"โš ๏ธ No dynamic few-shot Lambda ARN configured - skipping demonstration\")\n", + " config_few_shot = None\n", + " extraction_service_few_shot = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run dynamic few-shot extraction on first section\n", + "if DYNAMIC_FEW_SHOT_LAMBDA_ARN and document.sections:\n", + " first_section = document.sections[0]\n", + " print(f\"๐Ÿ”„ Processing section {first_section.section_id} with DYNAMIC FEW-SHOT\")\n", + " print(f\"Classification: {first_section.classification}\")\n", + " print(f\"Pages: {first_section.page_ids}\")\n", + " \n", + " # Create fresh document copy for examples processing\n", + " document_few_shot = Document.from_json(document.to_json())\n", + " \n", + " # Process with dynamic few-shot extraction\n", + " start_time = time.time()\n", + " \n", + " try:\n", + " document_few_shot = extraction_service_few_shot.process_document_section(\n", + " document=document_few_shot,\n", + " section_id=first_section.section_id\n", + " )\n", + " few_shot_extraction_time = time.time() - start_time\n", + " \n", + " print(f\"โœ… Dynamic few-shot extraction completed in {few_shot_extraction_time:.2f} seconds\")\n", + " \n", + " # Store results for comparison\n", + " few_shot_section_result = None\n", + " for section in document_few_shot.sections:\n", + " if section.section_id == first_section.section_id:\n", + " 
few_shot_section_result = section\n",
+    "                break\n",
+    "    \n",
+    "        # Performance comparison\n",
+    "        overhead = few_shot_extraction_time - default_extraction_time\n",
+    "        print(f\"\\n📊 Performance Comparison:\")\n",
+    "        print(f\"   Default: {default_extraction_time:.2f}s\")\n",
+    "        print(f\"   Dynamic Few-Shot: {few_shot_extraction_time:.2f}s\")\n",
+    "        print(f\"   Dynamic Few-Shot Overhead: {overhead:.2f}s ({overhead/default_extraction_time*100:.1f}% increase)\")\n",
+    "    \n",
+    "    except Exception as e:\n",
+    "        print(f\"❌ Dynamic few-shot extraction failed: {e}\")\n",
+    "        print(\"\\n🔍 This demonstrates the fail-fast error handling behavior\")\n",
+    "        few_shot_section_result = None\n",
+    "        few_shot_extraction_time = None\n",
+    "    \n",
+    "else:\n",
+    "    print(\"⚠️ Skipping dynamic few-shot extraction (no Lambda configured or no sections)\")\n",
+    "    document_few_shot = None\n",
+    "    few_shot_section_result = None\n",
+    "    few_shot_extraction_time = None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show section extraction result\n",
+    "if few_shot_section_result:\n",
+    "    print(f\"\\nSection {few_shot_section_result.section_id} extraction result:\")\n",
+    "    extraction_result_uri = few_shot_section_result.extraction_result_uri\n",
+    "\n",
+    "    if extraction_result_uri:\n",
+    "        result = get_json_content(extraction_result_uri)\n",
+    "        result_json = json.dumps(result[\"inference_result\"], indent=2)\n",
+    "        print(result_json)\n",
+    "\n",
+    "else:\n",
+    "    print(\"⚠️ No sections found in document\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Results and Summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"=== DEMO COMPLETE: SUMMARY ===\")\n",
+    "\n",
+    "sections_processed = 1 if document.sections else 0\n",
+    "dynamic_few_shot_used = DYNAMIC_FEW_SHOT_LAMBDA_ARN is not None\n",
+    "\n",
+    "print(f\"\\n✅ DEMO RESULTS:\")\n",
+    "print(f\"   📄 Document processed: {document.id}\")\n",
+    "print(f\"   📊 Sections processed: {sections_processed}\")\n",
+    "print(f\"   🔧 Dynamic Few-Shot used: {'Yes' if dynamic_few_shot_used else 'No'}\")\n",
+    "\n",
+    "if dynamic_few_shot_used and 'few_shot_extraction_time' in locals() and few_shot_extraction_time:\n",
+    "    print(f\"   ⏱️ Performance overhead: {few_shot_extraction_time - default_extraction_time:.2f}s\")\n",
+    "    print(f\"   📈 Accuracy improvement: Enhanced with few-shot examples\")\n",
+    "\n",
+    "print(f\"\\n🚀 TO IMPLEMENT DYNAMIC FEW-SHOT IN PRODUCTION:\")\n",
+    "print(f\"   1. 📁 Deploy dynamic few-shot Lambda stack\")\n",
+    "print(f\"   2. 📊 Populate S3 Vectors index with example documents\")\n",
+    "print(f\"   3. ⚙️ Add 'dynamic_few_shot_lambda_arn' to extraction config\")\n",
+    "print(f\"   4. 🧪 Test with your actual documents and use cases\")\n",
+    "print(f\"   5. 
๐Ÿ“Š Monitor CloudWatch logs for performance and accuracy\")\n", + "\n", + "print(f\"\\n๐Ÿ“š RESOURCES:\")\n", + "print(f\" ๐Ÿ“– Documentation: notebooks/examples/dynamic-few-shot-lambda/README.md\")\n", + "print(f\" ๐Ÿ”ง Lambda Function: notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py\")\n", + "print(f\" โ˜๏ธ Deploy: cd notebooks/examples/dynamic-few-shot-lambda && sam deploy --guided\")\n", + "print(f\" ๐Ÿ“Š Import Dataset: notebooks/misc/fewshot_dataset_import.ipynb\")\n", + "\n", + "print(f\"\\n๐Ÿ“Œ CONTINUE TO: step4_assessment.ipynb\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From ae2a9250e559a5c553b39b74a3da14ea104086b2 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Tue, 2 Dec 2025 09:54:42 +0100 Subject: [PATCH 09/39] chore: placeholder bucket name --- notebooks/misc/fewshot_dataset_import.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/misc/fewshot_dataset_import.ipynb b/notebooks/misc/fewshot_dataset_import.ipynb index baca9464..dc56a646 100644 --- a/notebooks/misc/fewshot_dataset_import.ipynb +++ b/notebooks/misc/fewshot_dataset_import.ipynb @@ -105,7 +105,7 @@ "outputs": [], "source": [ "# Configuration - Update these values based on your deployment of the 'notebooks/examples/dynamic-few-shot-lambda' stack\n", - "S3_BUCKET_FOR_IMAGES = \"genaiidp-dynamic-few-shot-dynamicfewshotdatasetbuc-nuz4jeue5hds\" # Stack output 'DynamicFewShotDatasetBucket'\n", + "S3_BUCKET_FOR_IMAGES = \"\" # Stack output 'DynamicFewShotDatasetBucket'\n", "S3_VECTORS_BUCKET = \"genaiidp-dynamic-few-shot\"\n", "S3_VECTORS_INDEX = \"documents\"\n", "\n", From bd52a2260b17d2b06e03480e4a36720550adfb3c Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Tue, 2 Dec 2025 10:02:58 +0100 Subject: [PATCH 10/39] chore: clarify distance --- .../GENAIIDP-dynamic-few-shot.py | 11 +++++++---- notebooks/examples/dynamic-few-shot-lambda/README.md | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py index 61b69295..a97fff33 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py +++ b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py @@ -45,8 +45,8 @@ def lambda_handler(event, context): Input event: { - "class_label": "", - "document_text": "", + "class_label": "", + "document_text": "", "image_content": ["", "", ...] } @@ -55,7 +55,7 @@ def lambda_handler(event, context): { "attributes_prompt": "expected attributes are: ...", "class_prompt": "This is an example of the class 'invoice'", - "distance": 0.892344521145, + "distance": 0.122344521145, "image_content": ["", "", ...] 
} ] @@ -138,7 +138,10 @@ def _s3vectors_find_similar_items(image_data): attributes = _extract_metadata(metadata, distance) result.append(attributes) - return result + # sort results by distance score (lowest to highest - lower is more similar) + sorted_result = sorted(result, key=lambda example: example['distance'], reverse=False) + + return sorted_result def _s3vectors_find_similar_items_from_image(page_image): """Search for similar items using image query""" diff --git a/notebooks/examples/dynamic-few-shot-lambda/README.md b/notebooks/examples/dynamic-few-shot-lambda/README.md index e30f913e..098be753 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/README.md +++ b/notebooks/examples/dynamic-few-shot-lambda/README.md @@ -117,7 +117,7 @@ extraction: { "attributes_prompt": "Expected attributes are: invoice_number [Unique identifier], invoice_date [Invoice date], total_amount [Total amount]...", "class_prompt": "This is an example of the class 'invoice'", - "distance": 0.892344521145, + "distance": 0.122344521145, # lower is more similar "image_content": ["", "", ...] } ] From 289386bc94a4d7ea7516a7bbf77f0e988a2f5431 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Tue, 2 Dec 2025 14:05:40 +0100 Subject: [PATCH 11/39] chore: debug log for S3 vectors result --- .../dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py index a97fff33..7f234c6d 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py +++ b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py @@ -158,6 +158,7 @@ def _s3vectors_find_similar_items_from_image(page_image): returnDistance=True, returnMetadata=True ) + logger.debug(f"S3 vectors lookup result: {response['vectors']}") return response["vectors"] def _merge_examples(examples, new_examples): From 6fd1b5ee4be30d5d5f9a06752718a6880e71f0ef Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Tue, 2 Dec 2025 14:05:57 +0100 Subject: [PATCH 12/39] chore: filter S3 vectors result by threshold --- .../GENAIIDP-dynamic-few-shot.py | 17 +++++++++++++++-- .../dynamic-few-shot-lambda/template.yml | 7 +++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py index 7f234c6d..495899bc 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py +++ b/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py @@ -31,6 +31,7 @@ S3VECTOR_DIMENSIONS = int(os.environ['S3VECTOR_DIMENSIONS']) MODEL_ID = os.environ['MODEL_ID'] TOP_K = int(os.environ['TOP_K']) + THRESHOLD = float(os.environ['THRESHOLD']) except (KeyError, ValueError, IndexError) as e: logger.error(f"Failed to parse environment variables: {e}") raise @@ -64,7 +65,7 @@ def lambda_handler(event, context): try: logger.info("=== DYNAMIC FEW-SHOT LAMBDA INVOKED ===") logger.debug(f"Complete input event: {json.dumps(event, indent=2)}") - + # Validate input class_label = event.get("class_label") document_text = event.get("document_text") @@ -78,6 +79,8 @@ def lambda_handler(event, context): # Decode input data image_data = _decode_images(image_content) + logger.info(f"=== FIND SIMILAR ITEMS ===") + # Find similar items using S3 vectors lookup from image similarity result = 
_s3vectors_find_similar_items(image_data) @@ -141,7 +144,17 @@ def _s3vectors_find_similar_items(image_data): # sort results by distance score (lowest to highest - lower is more similar) sorted_result = sorted(result, key=lambda example: example['distance'], reverse=False) - return sorted_result + # filter result by distance score + filtered_result = [] + for example in sorted_result: + if example['distance'] > THRESHOLD: + logger.info( + f"Skipping example with distance {example['distance']} above threshold {THRESHOLD}: {key}" + ) + else: + filtered_result.append(example) + + return filtered_result def _s3vectors_find_similar_items_from_image(page_image): """Search for similar items using image query""" diff --git a/notebooks/examples/dynamic-few-shot-lambda/template.yml b/notebooks/examples/dynamic-few-shot-lambda/template.yml index 927c9a65..2c5158da 100644 --- a/notebooks/examples/dynamic-few-shot-lambda/template.yml +++ b/notebooks/examples/dynamic-few-shot-lambda/template.yml @@ -35,6 +35,12 @@ Parameters: TopK: Type: Number Default: 2 + Description: The number of results to return for each S3 vectors query. + + Threshold: + Type: Number + Default: 0.2 + Description: Filter results exceeding this similarity threshold (lower is more similar) LambdaFunctionName: Type: String @@ -79,6 +85,7 @@ Resources: S3VECTOR_DIMENSIONS: !Ref VectorDimensions MODEL_ID: !Ref ModelId TOP_K: !Ref TopK + THRESHOLD: !Ref Threshold LoggingConfig: LogGroup: !Ref DynamicFewShotLogGroup # Minimal permissions - only needs basic execution and logging From 8d16da197f09deebdeea14d27569a54e19804742 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Thu, 4 Dec 2025 09:01:50 +0100 Subject: [PATCH 13/39] chore: add comment on PIL requirement for generate_embedding --- lib/idp_common_pkg/idp_common/bedrock/client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/idp_common_pkg/idp_common/bedrock/client.py b/lib/idp_common_pkg/idp_common/bedrock/client.py index 9167a4ce..7e14d4bb 100644 --- a/lib/idp_common_pkg/idp_common/bedrock/client.py +++ b/lib/idp_common_pkg/idp_common/bedrock/client.py @@ -733,6 +733,7 @@ def generate_embedding( Returns: List of floats representing the embedding vector """ + # requires PIL from idp_common.image import ( prepare_image, prepare_bedrock_image_attachment From 0b7a57d4dbbd561e3f61572e65f82e2a7a1e651b Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Thu, 11 Dec 2025 14:54:13 +0100 Subject: [PATCH 14/39] chore: move dynamic-few-shot to plugins folder --- plugins/dynamic-few-shot-lambda/.gitignore | 1 + .../dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py | 0 .../examples => plugins}/dynamic-few-shot-lambda/README.md | 0 .../notebooks}/config/extraction_with_few_shot.yaml | 0 .../notebooks}/fewshot_dataset_import.ipynb | 0 .../notebooks}/step3_extraction_with_dynamic_few_shot.ipynb | 0 .../dynamic-few-shot-lambda/requirements.txt | 0 .../examples => plugins}/dynamic-few-shot-lambda/samconfig.toml | 0 .../examples => plugins}/dynamic-few-shot-lambda/template.yml | 0 9 files changed, 1 insertion(+) create mode 100644 plugins/dynamic-few-shot-lambda/.gitignore rename {notebooks/examples => plugins}/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py (100%) rename {notebooks/examples => plugins}/dynamic-few-shot-lambda/README.md (100%) rename {notebooks/examples => plugins/dynamic-few-shot-lambda/notebooks}/config/extraction_with_few_shot.yaml (100%) rename {notebooks/misc => plugins/dynamic-few-shot-lambda/notebooks}/fewshot_dataset_import.ipynb (100%) rename 
{notebooks/examples => plugins/dynamic-few-shot-lambda/notebooks}/step3_extraction_with_dynamic_few_shot.ipynb (100%) rename {notebooks/examples => plugins}/dynamic-few-shot-lambda/requirements.txt (100%) rename {notebooks/examples => plugins}/dynamic-few-shot-lambda/samconfig.toml (100%) rename {notebooks/examples => plugins}/dynamic-few-shot-lambda/template.yml (100%) diff --git a/plugins/dynamic-few-shot-lambda/.gitignore b/plugins/dynamic-few-shot-lambda/.gitignore new file mode 100644 index 00000000..c0190e10 --- /dev/null +++ b/plugins/dynamic-few-shot-lambda/.gitignore @@ -0,0 +1 @@ +datasets \ No newline at end of file diff --git a/notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py similarity index 100% rename from notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py rename to plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py diff --git a/notebooks/examples/dynamic-few-shot-lambda/README.md b/plugins/dynamic-few-shot-lambda/README.md similarity index 100% rename from notebooks/examples/dynamic-few-shot-lambda/README.md rename to plugins/dynamic-few-shot-lambda/README.md diff --git a/notebooks/examples/config/extraction_with_few_shot.yaml b/plugins/dynamic-few-shot-lambda/notebooks/config/extraction_with_few_shot.yaml similarity index 100% rename from notebooks/examples/config/extraction_with_few_shot.yaml rename to plugins/dynamic-few-shot-lambda/notebooks/config/extraction_with_few_shot.yaml diff --git a/notebooks/misc/fewshot_dataset_import.ipynb b/plugins/dynamic-few-shot-lambda/notebooks/fewshot_dataset_import.ipynb similarity index 100% rename from notebooks/misc/fewshot_dataset_import.ipynb rename to plugins/dynamic-few-shot-lambda/notebooks/fewshot_dataset_import.ipynb diff --git a/notebooks/examples/step3_extraction_with_dynamic_few_shot.ipynb b/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb similarity index 100% rename from notebooks/examples/step3_extraction_with_dynamic_few_shot.ipynb rename to plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb diff --git a/notebooks/examples/dynamic-few-shot-lambda/requirements.txt b/plugins/dynamic-few-shot-lambda/requirements.txt similarity index 100% rename from notebooks/examples/dynamic-few-shot-lambda/requirements.txt rename to plugins/dynamic-few-shot-lambda/requirements.txt diff --git a/notebooks/examples/dynamic-few-shot-lambda/samconfig.toml b/plugins/dynamic-few-shot-lambda/samconfig.toml similarity index 100% rename from notebooks/examples/dynamic-few-shot-lambda/samconfig.toml rename to plugins/dynamic-few-shot-lambda/samconfig.toml diff --git a/notebooks/examples/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml similarity index 100% rename from notebooks/examples/dynamic-few-shot-lambda/template.yml rename to plugins/dynamic-few-shot-lambda/template.yml From 035b28c2e2b0fcbf493ee1caf30e4f7270bb1fcf Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Thu, 11 Dec 2025 15:11:03 +0100 Subject: [PATCH 15/39] chore: ignore datasets folder --- plugins/dynamic-few-shot-lambda/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/dynamic-few-shot-lambda/.gitignore b/plugins/dynamic-few-shot-lambda/.gitignore index c0190e10..f3c07f0d 100644 --- a/plugins/dynamic-few-shot-lambda/.gitignore +++ b/plugins/dynamic-few-shot-lambda/.gitignore @@ -1 +1 @@ -datasets \ No newline at end of 
file +datasets/ \ No newline at end of file From 854fa8b23e9d3bd4a9ea9d612959bc1239850256 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Thu, 11 Dec 2025 15:13:38 +0100 Subject: [PATCH 16/39] chore: ruff format --- .../GENAIIDP-dynamic-few-shot.py | 63 +++++++++++-------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py index 495899bc..f2b6edb9 100644 --- a/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py +++ b/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py @@ -26,20 +26,21 @@ # Parse environment variables with error handling try: - S3VECTOR_BUCKET = os.environ['S3VECTOR_BUCKET'] - S3VECTOR_INDEX = os.environ['S3VECTOR_INDEX'] - S3VECTOR_DIMENSIONS = int(os.environ['S3VECTOR_DIMENSIONS']) - MODEL_ID = os.environ['MODEL_ID'] - TOP_K = int(os.environ['TOP_K']) - THRESHOLD = float(os.environ['THRESHOLD']) + S3VECTOR_BUCKET = os.environ["S3VECTOR_BUCKET"] + S3VECTOR_INDEX = os.environ["S3VECTOR_INDEX"] + S3VECTOR_DIMENSIONS = int(os.environ["S3VECTOR_DIMENSIONS"]) + MODEL_ID = os.environ["MODEL_ID"] + TOP_K = int(os.environ["TOP_K"]) + THRESHOLD = float(os.environ["THRESHOLD"]) except (KeyError, ValueError, IndexError) as e: logger.error(f"Failed to parse environment variables: {e}") raise # Initialize clients -s3vectors = boto3.client('s3vectors') +s3vectors = boto3.client("s3vectors") bedrock_client = bedrock.BedrockClient() + def lambda_handler(event, context): """ Process a document to find similar examples using S3 Vectors similarity search. @@ -61,7 +62,7 @@ def lambda_handler(event, context): } ] """ - + try: logger.info("=== DYNAMIC FEW-SHOT LAMBDA INVOKED ===") logger.debug(f"Complete input event: {json.dumps(event, indent=2)}") @@ -73,7 +74,9 @@ def lambda_handler(event, context): logger.info(f"=== INPUT VALUES ===") logger.info(f"Class label: {class_label if class_label else 'Not specified'}") - logger.info(f"Document text: {len(document_text) if document_text else "0"} bytes") + logger.info( + f"Document text: {len(document_text) if document_text else '0'} bytes" + ) logger.info(f"Image content: {len(image_content)} images") # Decode input data @@ -91,15 +94,18 @@ def lambda_handler(event, context): logger.info("=== DYNAMIC FEW-SHOT LAMBDA COMPLETED ===") return result - + except Exception as e: logger.error(f"=== DYNAMIC FEW-SHOT LAMBDA ERROR ===") logger.error(f"Error type: {type(e).__name__}") logger.error(f"Error message: {str(e)}") - logger.error(f"Input event keys: {list(event.keys()) if 'event' in locals() else 'Unknown'}") + logger.error( + f"Input event keys: {list(event.keys()) if 'event' in locals() else 'Unknown'}" + ) # In demo, we'll fail gracefully with detailed error info raise Exception(f"Dynamic few-shot Lambda failed: {str(e)}") + def _decode_images(image_content): """Base64 decode image content to bytes""" result = [] @@ -108,14 +114,16 @@ def _decode_images(image_content): result.append(image_data) return result + def _encode_images(image_content): """Base64 encode image content to JSON-serializable string""" result = [] for image_bytes in image_content: - image_base64 = base64.b64encode(image_bytes).decode("utf-8") - result.append(image_base64) + image_base64 = base64.b64encode(image_bytes).decode("utf-8") + result.append(image_base64) return result + def _s3vectors_find_similar_items(image_data): """Find similar items for input""" # find similar items based on image similarity only @@ -133,21 
+141,21 @@ def _s3vectors_find_similar_items(image_data): # Only process this example if it has a non-empty attributesPrompt if not attributes_prompt or not attributes_prompt.strip(): - logger.info( - f"Skipping example with empty attributesPrompt: {key}" - ) + logger.info(f"Skipping example with empty attributesPrompt: {key}") continue attributes = _extract_metadata(metadata, distance) result.append(attributes) # sort results by distance score (lowest to highest - lower is more similar) - sorted_result = sorted(result, key=lambda example: example['distance'], reverse=False) + sorted_result = sorted( + result, key=lambda example: example["distance"], reverse=False + ) # filter result by distance score filtered_result = [] for example in sorted_result: - if example['distance'] > THRESHOLD: + if example["distance"] > THRESHOLD: logger.info( f"Skipping example with distance {example['distance']} above threshold {THRESHOLD}: {key}" ) @@ -156,6 +164,7 @@ def _s3vectors_find_similar_items(image_data): return filtered_result + def _s3vectors_find_similar_items_from_image(page_image): """Search for similar items using image query""" embedding = bedrock_client.generate_embedding( @@ -169,11 +178,12 @@ def _s3vectors_find_similar_items_from_image(page_image): queryVector={"float32": embedding}, topK=TOP_K, returnDistance=True, - returnMetadata=True + returnMetadata=True, ) logger.debug(f"S3 vectors lookup result: {response['vectors']}") return response["vectors"] + def _merge_examples(examples, new_examples): """ Merge in-place new examples into the result list, avoiding duplicates. @@ -185,7 +195,7 @@ def _merge_examples(examples, new_examples): for new_example in new_examples: key = new_example["key"] new_distance = new_example.get("distance", 1.0) - + # update example if examples.get(key): existing_distance = examples[key].get("distance", 1.0) @@ -195,9 +205,10 @@ def _merge_examples(examples, new_examples): else: examples[key] = { "distance": new_distance, - "metadata": new_example.get("metadata") + "metadata": new_example.get("metadata"), } + def _extract_metadata(metadata, distance): """Create result object from S3 vectors metadata""" # Result object attributes @@ -215,6 +226,7 @@ def _extract_metadata(metadata, distance): return attributes + def _get_image_data_from_s3_path(image_path): """ Load images from image path @@ -237,9 +249,7 @@ def _get_image_data_from_s3_path(image_path): # Direct S3 URI image_bytes = s3.get_binary_content(image_file_path) else: - raise ValueError( - f"Invalid file path {image_path} - expecting S3 path" - ) + raise ValueError(f"Invalid file path {image_path} - expecting S3 path") image_content.append(image_bytes) except Exception as e: @@ -248,6 +258,7 @@ def _get_image_data_from_s3_path(image_path): return image_content + def _get_image_files_from_s3_path(image_path): """ Get list of image files from an S3 path. 
@@ -260,9 +271,7 @@ def _get_image_files_from_s3_path(image_path): """ # Handle S3 URIs if not image_path.startswith("s3://"): - raise ValueError( - f"Invalid file path {image_path} - expecting S3 URI" - ) + raise ValueError(f"Invalid file path {image_path} - expecting S3 URI") # Check if it's a direct file or a prefix if image_path.endswith( From b5f88732ac803a0e442115b557b7cf800826a968 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Thu, 11 Dec 2025 23:38:43 +0100 Subject: [PATCH 17/39] feat: update dynamic-few-shot Lambda to implement Custom Prompt Lambda interface --- .../GENAIIDP-dynamic-few-shot.py | 284 ------------ plugins/dynamic-few-shot-lambda/README.md | 6 +- .../dynamic-few-shot-lambda/requirements.txt | 1 - .../src/GENAIIDP-dynamic-few-shot.py | 416 ++++++++++++++++++ .../src/requirements.txt | 1 + plugins/dynamic-few-shot-lambda/template.yml | 21 +- 6 files changed, 440 insertions(+), 289 deletions(-) delete mode 100644 plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py delete mode 100644 plugins/dynamic-few-shot-lambda/requirements.txt create mode 100644 plugins/dynamic-few-shot-lambda/src/GENAIIDP-dynamic-few-shot.py create mode 100644 plugins/dynamic-few-shot-lambda/src/requirements.txt diff --git a/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py b/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py deleted file mode 100644 index f2b6edb9..00000000 --- a/plugins/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py +++ /dev/null @@ -1,284 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: MIT-0 - -""" -Lambda function to provide examples with ground truth data based on S3 Vectors lookup. - -Key Features Demonstrated: -- Dynamically retrieve similar examples based on document content using vector similarity search -- Provide few-shot examples to improve extraction accuracy through example-based prompting -- Leverage S3 Vectors for efficient similarity search across large example datasets -- Integrate multimodal embeddings using Amazon Nova models for image-based similarity -- Customize example selection based on document characteristics and business rules -""" - -import json -import logging -import base64 -import boto3 -import os - -from idp_common import bedrock, s3 - -logger = logging.getLogger(__name__) -level = logging.getLevelName(os.environ.get("LOG_LEVEL", "INFO")) -logger.setLevel(level) - -# Parse environment variables with error handling -try: - S3VECTOR_BUCKET = os.environ["S3VECTOR_BUCKET"] - S3VECTOR_INDEX = os.environ["S3VECTOR_INDEX"] - S3VECTOR_DIMENSIONS = int(os.environ["S3VECTOR_DIMENSIONS"]) - MODEL_ID = os.environ["MODEL_ID"] - TOP_K = int(os.environ["TOP_K"]) - THRESHOLD = float(os.environ["THRESHOLD"]) -except (KeyError, ValueError, IndexError) as e: - logger.error(f"Failed to parse environment variables: {e}") - raise - -# Initialize clients -s3vectors = boto3.client("s3vectors") -bedrock_client = bedrock.BedrockClient() - - -def lambda_handler(event, context): - """ - Process a document to find similar examples using S3 Vectors similarity search. - - Input event: - { - "class_label": "", - "document_text": "", - "image_content": ["", "", ...] - } - - Return format: - [ - { - "attributes_prompt": "expected attributes are: ...", - "class_prompt": "This is an example of the class 'invoice'", - "distance": 0.122344521145, - "image_content": ["", "", ...] 
- } - ] - """ - - try: - logger.info("=== DYNAMIC FEW-SHOT LAMBDA INVOKED ===") - logger.debug(f"Complete input event: {json.dumps(event, indent=2)}") - - # Validate input - class_label = event.get("class_label") - document_text = event.get("document_text") - image_content = event.get("image_content", []) - - logger.info(f"=== INPUT VALUES ===") - logger.info(f"Class label: {class_label if class_label else 'Not specified'}") - logger.info( - f"Document text: {len(document_text) if document_text else '0'} bytes" - ) - logger.info(f"Image content: {len(image_content)} images") - - # Decode input data - image_data = _decode_images(image_content) - - logger.info(f"=== FIND SIMILAR ITEMS ===") - - # Find similar items using S3 vectors lookup from image similarity - result = _s3vectors_find_similar_items(image_data) - - # Log complete output structure - logger.info(f"=== OUTPUT ANALYSIS ===") - logger.debug(f"Complete result: {json.dumps(result, indent=2)}") - logger.info(f"Output items: {len(result)}") - - logger.info("=== DYNAMIC FEW-SHOT LAMBDA COMPLETED ===") - return result - - except Exception as e: - logger.error(f"=== DYNAMIC FEW-SHOT LAMBDA ERROR ===") - logger.error(f"Error type: {type(e).__name__}") - logger.error(f"Error message: {str(e)}") - logger.error( - f"Input event keys: {list(event.keys()) if 'event' in locals() else 'Unknown'}" - ) - # In demo, we'll fail gracefully with detailed error info - raise Exception(f"Dynamic few-shot Lambda failed: {str(e)}") - - -def _decode_images(image_content): - """Base64 decode image content to bytes""" - result = [] - for image_base64 in image_content: - image_data = base64.b64decode(image_base64) - result.append(image_data) - return result - - -def _encode_images(image_content): - """Base64 encode image content to JSON-serializable string""" - result = [] - for image_bytes in image_content: - image_base64 = base64.b64encode(image_bytes).decode("utf-8") - result.append(image_base64) - return result - - -def _s3vectors_find_similar_items(image_data): - """Find similar items for input""" - # find similar items based on image similarity only - similar_items = {} - for page_image in image_data: - result = _s3vectors_find_similar_items_from_image(page_image) - _merge_examples(similar_items, result) - - # create result set - result = [] - for key, example in similar_items.items(): - metadata = example.get("metadata", {}) - distance = example.get("distance") - attributes_prompt = metadata.get("attributesPrompt") - - # Only process this example if it has a non-empty attributesPrompt - if not attributes_prompt or not attributes_prompt.strip(): - logger.info(f"Skipping example with empty attributesPrompt: {key}") - continue - - attributes = _extract_metadata(metadata, distance) - result.append(attributes) - - # sort results by distance score (lowest to highest - lower is more similar) - sorted_result = sorted( - result, key=lambda example: example["distance"], reverse=False - ) - - # filter result by distance score - filtered_result = [] - for example in sorted_result: - if example["distance"] > THRESHOLD: - logger.info( - f"Skipping example with distance {example['distance']} above threshold {THRESHOLD}: {key}" - ) - else: - filtered_result.append(example) - - return filtered_result - - -def _s3vectors_find_similar_items_from_image(page_image): - """Search for similar items using image query""" - embedding = bedrock_client.generate_embedding( - image_source=page_image, - model_id=MODEL_ID, - dimensions=S3VECTOR_DIMENSIONS, - ) - response = 
s3vectors.query_vectors( - vectorBucketName=S3VECTOR_BUCKET, - indexName=S3VECTOR_INDEX, - queryVector={"float32": embedding}, - topK=TOP_K, - returnDistance=True, - returnMetadata=True, - ) - logger.debug(f"S3 vectors lookup result: {response['vectors']}") - return response["vectors"] - - -def _merge_examples(examples, new_examples): - """ - Merge in-place new examples into the result list, avoiding duplicates. - - Args: - examples: Dict of existing examples - new_examples: List of new examples to be merged - """ - for new_example in new_examples: - key = new_example["key"] - new_distance = new_example.get("distance", 1.0) - - # update example - if examples.get(key): - existing_distance = examples[key].get("distance", 1.0) - examples[key]["distance"] = min(new_distance, existing_distance) - examples[key]["metadata"] = new_example.get("metadata") - # insert example - else: - examples[key] = { - "distance": new_distance, - "metadata": new_example.get("metadata"), - } - - -def _extract_metadata(metadata, distance): - """Create result object from S3 vectors metadata""" - # Result object attributes - attributes = { - "attributes_prompt": metadata.get("attributesPrompt"), - "class_prompt": metadata.get("classPrompt"), - "distance": distance, - } - - image_path = metadata.get("imagePath") - if image_path: - image_data = _get_image_data_from_s3_path(image_path) - encoded_images = _encode_images(image_data) - attributes["image_content"] = encoded_images - - return attributes - - -def _get_image_data_from_s3_path(image_path): - """ - Load images from image path - - Args: - image_path: Path to image file, directory, or S3 prefix - - Returns: - List of images (bytes) - """ - # Get list of image files from the path (supports directories/prefixes) - image_files = _get_image_files_from_s3_path(image_path) - image_content = [] - - # Process each image file - for image_file_path in image_files: - try: - # Load image content - if image_file_path.startswith("s3://"): - # Direct S3 URI - image_bytes = s3.get_binary_content(image_file_path) - else: - raise ValueError(f"Invalid file path {image_path} - expecting S3 path") - - image_content.append(image_bytes) - except Exception as e: - logger.warning(f"Failed to load image {image_file_path}: {e}") - continue - - return image_content - - -def _get_image_files_from_s3_path(image_path): - """ - Get list of image files from an S3 path. 
- - Args: - image_path: Path to image file, directory, or S3 prefix - - Returns: - List of image file paths/URIs sorted by filename - """ - # Handle S3 URIs - if not image_path.startswith("s3://"): - raise ValueError(f"Invalid file path {image_path} - expecting S3 URI") - - # Check if it's a direct file or a prefix - if image_path.endswith( - (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp") - ): - # Direct S3 file - return [image_path] - else: - # S3 prefix - list all images - return s3.list_images_from_path(image_path) diff --git a/plugins/dynamic-few-shot-lambda/README.md b/plugins/dynamic-few-shot-lambda/README.md index 098be753..a400f61a 100644 --- a/plugins/dynamic-few-shot-lambda/README.md +++ b/plugins/dynamic-few-shot-lambda/README.md @@ -67,7 +67,7 @@ flowchart TD ```bash # Navigate to the dynamic-few-shot-lambda directory -cd notebooks/examples/dynamic-few-shot-lambda +cd plugins/dynamic-few-shot-lambda # Deploy using AWS SAM sam deploy --guided @@ -86,7 +86,7 @@ aws cloudformation describe-stacks \ ### Step 3: Populate the Examples Dataset -Use the [fewshot_dataset_import.ipynb](../../misc/fewshot_dataset_import.ipynb) notebook to import a dataset into S3 Vectors, or manually upload your example documents and metadata to the S3 bucket and vector index created by the stack. +Use the [fewshot_dataset_import.ipynb](notebooks/fewshot_dataset_import.ipynb) notebook to import a dataset into S3 Vectors, or manually upload your example documents and metadata to the S3 bucket and vector index created by the stack. ### Step 4: Configure IDP to Use Dynamic-few shot @@ -94,7 +94,7 @@ Add the Lambda ARN to your IDP extraction configuration: ```yaml extraction: - dynamic_few_shot_lambda_arn: "arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot" + custom_prompt_lambda_arn: "arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot" ``` ## Lambda Interface diff --git a/plugins/dynamic-few-shot-lambda/requirements.txt b/plugins/dynamic-few-shot-lambda/requirements.txt deleted file mode 100644 index 2048c02c..00000000 --- a/plugins/dynamic-few-shot-lambda/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -../../../lib/idp_common_pkg[extraction,docs_service] # extraction module and document service with dependencies diff --git a/plugins/dynamic-few-shot-lambda/src/GENAIIDP-dynamic-few-shot.py b/plugins/dynamic-few-shot-lambda/src/GENAIIDP-dynamic-few-shot.py new file mode 100644 index 00000000..49aab6c3 --- /dev/null +++ b/plugins/dynamic-few-shot-lambda/src/GENAIIDP-dynamic-few-shot.py @@ -0,0 +1,416 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +""" +Lambda function to provide examples with ground truth data based on S3 Vectors lookup. 
+ +Key Features Demonstrated: +- Dynamically retrieve similar examples based on document content using vector similarity search +- Provide few-shot examples to improve extraction accuracy through example-based prompting +- Leverage S3 Vectors for efficient similarity search across large example datasets +- Integrate multimodal embeddings using Amazon Nova models for image-based similarity +- Customize example selection based on document characteristics and business rules +""" + +import json +import logging +import base64 +import boto3 +import os + +from idp_common import bedrock, s3 +from idp_common.bedrock import format_prompt + +from typing import Any + +logger = logging.getLogger(__name__) +level = logging.getLevelName(os.environ.get("LOG_LEVEL", "INFO")) +logger.setLevel(level) + +# Parse environment variables with error handling +try: + S3VECTOR_BUCKET = os.environ["S3VECTOR_BUCKET"] + S3VECTOR_INDEX = os.environ["S3VECTOR_INDEX"] + S3VECTOR_DIMENSIONS = int(os.environ["S3VECTOR_DIMENSIONS"]) + MODEL_ID = os.environ["MODEL_ID"] + TOP_K = int(os.environ["TOP_K"]) + THRESHOLD = float(os.environ["THRESHOLD"]) +except (KeyError, ValueError, IndexError) as e: + logger.error(f"Failed to parse environment variables: {e}") + raise + +# Initialize clients +s3vectors = boto3.client("s3vectors") +bedrock_client = bedrock.BedrockClient() + + +def lambda_handler(event, context): + """ + Process a document to find similar examples using S3 Vectors similarity search. + This function will expand {FEW_SHOT_EXAMPLES} in the extraction prompt to examples + found in S3 Vectors lookup. + """ + + try: + logger.info("=== DYNAMIC FEW-SHOT LAMBDA INVOKED ===") + logger.debug(f"Complete input event: {json.dumps(event, indent=2)}") + + # Extract key information from the payload + config = event.get("config", {}) + placeholders = event.get("prompt_placeholders", {}) + default_content = event.get("default_task_prompt_content", []) + document = event.get("serialized_document", {}) + + document_class = placeholders.get("DOCUMENT_CLASS", "") + document_text = placeholders.get("DOCUMENT_TEXT", "") + document_image_uris = placeholders.get("DOCUMENT_IMAGE", []) + document_id = document.get("id", "unknown") + + # Log extraction config details + extraction_config = config.get("extraction", {}) + logger.info(f"=== EXTRACTION CONFIG ===") + logger.info(f"Model: {extraction_config.get('model', 'Not specified')}") + logger.info(f"Temperature: {extraction_config.get('temperature', 'Not specified')}") + logger.info(f"Max tokens: {extraction_config.get('max_tokens', 'Not specified')}") + logger.info(f"Custom Lambda ARN: {extraction_config.get('custom_prompt_lambda_arn', 'Not specified')}") + + # Default system prompt from config + default_system_prompt = config.get("extraction", {}).get("system_prompt", "") + logger.info(f"Default system prompt length: {len(default_system_prompt)} characters") + default_task_prompt = config.get("extraction", {}).get("task_prompt", "") + logger.info(f"Default task prompt length: {len(default_task_prompt)} characters") + + logger.info(f"=== HANDLE INPUT DOCUMENT ===") + + # Handle input document + result = _handle_input_document(placeholders, default_system_prompt, default_task_prompt) + + # Log complete output structure + logger.info(f"=== OUTPUT ANALYSIS ===") + logger.info(f"Output keys: {list(result.keys())}") + logger.info(f"System prompt length: {len(result.get('system_prompt', ''))}") + logger.info(f"System prompt (first 200 chars): {result.get('system_prompt', '')[:200]}...") + + 
task_content = result.get('task_prompt_content', [])
+        logger.info(f"Task prompt content items: {len(task_content)}")
+        for i, item in enumerate(task_content[:3]): # Log first 3 items
+            logger.info(f"Content item {i}: keys={list(item.keys())}")
+            if 'text' in item:
+                logger.info(f"  Text length: {len(item['text'])} characters")
+                logger.info(f"  Text sample (first 150 chars): {item['text'][:150]}...")
+            if 'image_uri' in item:
+                logger.info(f"  Image URI: {item['image_uri']}")
+
+        if len(task_content) > 3:
+            logger.info(f"  ... and {len(task_content) - 3} more content items")
+
+        logger.debug(f"Complete result output: {json.dumps(result, indent=2)}")
+        logger.info("=== DYNAMIC FEW-SHOT LAMBDA COMPLETED ===")
+        return result
+
+    except Exception as e:
+        logger.error("=== DYNAMIC FEW-SHOT LAMBDA ERROR ===")
+        logger.error(f"Error type: {type(e).__name__}")
+        logger.error(f"Error message: {str(e)}")
+        logger.error(
+            f"Input event keys: {list(event.keys()) if 'event' in locals() else 'Unknown'}"
+        )
+        # Re-raise with context so the failure surfaces clearly in the IDP pipeline logs
+        raise Exception(f"Dynamic few-shot Lambda failed: {str(e)}")
+
+def _handle_input_document(placeholders, default_system_prompt, default_task_prompt):
+    """
+    Handle the input request and return custom system_prompt and task_prompt_content
+    """
+    substitutions = {
+        "DOCUMENT_TEXT": placeholders.get("DOCUMENT_TEXT"),
+        "DOCUMENT_CLASS": placeholders.get("DOCUMENT_CLASS"),
+        "ATTRIBUTE_NAMES_AND_DESCRIPTIONS": placeholders.get("ATTRIBUTE_NAMES_AND_DESCRIPTIONS")
+    }
+    task_prompt_content = _build_prompt_content(
+        default_task_prompt, substitutions, placeholders.get("DOCUMENT_IMAGE")
+    )
+
+    return {
+        "system_prompt": default_system_prompt,
+        "task_prompt_content": task_prompt_content
+    }
+
+
+def _build_prompt_content(
+    prompt_template: str,
+    substitutions: dict[str, Any],
+    image_content: Any = None,
+) -> list[dict[str, Any]]:
+    """
+    Build prompt content array handling FEW_SHOT_EXAMPLES and DOCUMENT_IMAGE placeholders.
+
+    This consolidated method handles all placeholder types and combinations:
+    - {FEW_SHOT_EXAMPLES}: Inserts few-shot examples retrieved from S3 Vectors
+    - {DOCUMENT_IMAGE}: Inserts images at a specific location
+    - Regular text placeholders: DOCUMENT_TEXT, DOCUMENT_CLASS, etc.
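+
+    Text placeholders are substituted via idp_common.bedrock.format_prompt.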
+
+    Args:
+        prompt_template: The prompt template with optional placeholders
+        substitutions: Dictionary of placeholder values
+        image_content: Optional image content to insert (only used with {DOCUMENT_IMAGE})
+
+    Returns:
+        List of content items with text and image content properly ordered
+    """
+    content: list[dict[str, Any]] = []
+
+    # Handle FEW_SHOT_EXAMPLES placeholder first
+    if "{FEW_SHOT_EXAMPLES}" in prompt_template:
+        parts = prompt_template.split("{FEW_SHOT_EXAMPLES}")
+        if len(parts) == 2:
+            # Process before examples
+            content.extend(
+                _build_text_and_image_content(parts[0], substitutions, image_content)
+            )
+
+            # Add few-shot examples
+            content.extend(_build_few_shot_examples_content(image_content))
+
+            # Process after examples (only pass images if not already used)
+            image_for_after = (
+                None if "{DOCUMENT_IMAGE}" in parts[0] else image_content
+            )
+            content.extend(
+                _build_text_and_image_content(parts[1], substitutions, image_for_after)
+            )
+
+            return content
+
+    # No usable FEW_SHOT_EXAMPLES placeholder, just handle text and images
+    logger.warning("Missing or malformed {FEW_SHOT_EXAMPLES} placeholder in prompt template")
+    return _build_text_and_image_content(prompt_template, substitutions, image_content)
+
+
+def _build_text_and_image_content(
+    prompt_template: str,
+    substitutions: dict[str, Any],
+    image_content: Any = None,
+) -> list[dict[str, Any]]:
+    """
+    Build content array with text and optionally images based on the DOCUMENT_IMAGE placeholder.
+
+    Args:
+        prompt_template: Template that may contain {DOCUMENT_IMAGE}
+        substitutions: Dictionary of placeholder values
+        image_content: Optional image content
+
+    Returns:
+        List of content items
+    """
+    content: list[dict[str, Any]] = []
+
+    if "{DOCUMENT_IMAGE}" in prompt_template:
+        parts = prompt_template.split("{DOCUMENT_IMAGE}")
+        if len(parts) == 2:
+            # Add text before image
+            before_text = _prepare_prompt_from_template(
+                parts[0], substitutions, required_placeholders=[]
+            )
+            if before_text.strip():
+                content.append({"text": before_text})
+
+            # Add images
+            if image_content:
+                for image_uri in image_content:
+                    content.append({"image_uri": image_uri})
+
+            # Add text after image
+            after_text = _prepare_prompt_from_template(
+                parts[1], substitutions, required_placeholders=[]
+            )
+            if after_text.strip():
+                content.append({"text": after_text})
+
+            return content
+        else:
+            logger.warning("Invalid DOCUMENT_IMAGE placeholder usage (expected exactly one occurrence)")
+
+    # No image placeholder, just text
+    task_prompt = _prepare_prompt_from_template(
+        prompt_template, substitutions, required_placeholders=[]
+    )
+    content.append({"text": task_prompt})
+
+    return content
+
+
+def _build_few_shot_examples_content(image_content: Any = None) -> list[dict[str, Any]]:
+    """
+    Build content items for few-shot examples retrieved from S3 Vectors for a specific class.
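+    Examples are found by embedding the document page images and querying the vector index.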
+
+    Args:
+        image_content: Optional document image content
+
+    Returns:
+        List of content items containing text and image content for examples
+    """
+    content: list[dict[str, Any]] = []
+
+    image_data = []
+    if image_content:
+        for image_uri in image_content:
+            # Load image content
+            if image_uri.startswith("s3://"):
+                # Direct S3 URI
+                image_bytes = s3.get_binary_content(image_uri)
+            else:
+                raise ValueError(f"Invalid file path {image_uri} - expecting S3 path")
+
+            image_data.append(image_bytes)
+
+    examples = _s3vectors_find_similar_items(image_data)
+    for example in examples:
+        content.append({"text": example.get("attributesPrompt")})
+
+        for image_uri in example.get("imageFiles", []):
+            content.append({"image_uri": image_uri})
+
+    return content
+
+
+def _prepare_prompt_from_template(prompt_template, substitutions, required_placeholders):
+    """
+    Prepare prompt from template by replacing placeholders with values.
+
+    Args:
+        prompt_template: The prompt template with placeholders
+        substitutions: Dictionary of placeholder values
+        required_placeholders: List of placeholder names that must be present in the template
+
+    Returns:
+        String with placeholders replaced by values
+
+    Raises:
+        ValueError: If a required placeholder is missing from the template
+    """
+
+    return format_prompt(prompt_template, substitutions, required_placeholders)
+
+
+def _s3vectors_find_similar_items(image_data):
+    """Find similar examples for the input page images via S3 Vectors lookup"""
+    # find similar items based on image similarity only
+    similar_items = {}
+    for page_image in image_data:
+        result = _s3vectors_find_similar_items_from_image(page_image)
+        _merge_examples(similar_items, result)
+
+    # create result set
+    result = []
+    for key, example in similar_items.items():
+        metadata = example.get("metadata", {})
+        distance = example.get("distance")
+        attributes_prompt = metadata.get("attributesPrompt")
+
+        # Only process this example if it has a non-empty attributesPrompt
+        if not attributes_prompt or not attributes_prompt.strip():
+            logger.info(f"Skipping example with empty attributesPrompt: {key}")
+            continue
+
+        attributes = _extract_metadata(metadata, distance)
+        result.append(attributes)
+
+    # sort results by distance score (lowest to highest - lower is more similar)
+    sorted_result = sorted(
+        result, key=lambda example: example["distance"], reverse=False
+    )
+
+    # filter result by distance score
+    filtered_result = []
+    for example in sorted_result:
+        if example["distance"] > THRESHOLD:
+            logger.info(
+                f"Skipping example with distance {example['distance']} above threshold {THRESHOLD}"
+            )
+        else:
+            filtered_result.append(example)
+
+    return filtered_result
+
+
+def _s3vectors_find_similar_items_from_image(page_image):
+    """Search for similar items using image query"""
+    embedding = bedrock_client.generate_embedding(
+        image_source=page_image,
+        model_id=MODEL_ID,
+        dimensions=S3VECTOR_DIMENSIONS,
+    )
+    response = s3vectors.query_vectors(
+        vectorBucketName=S3VECTOR_BUCKET,
+        indexName=S3VECTOR_INDEX,
+        queryVector={"float32": embedding},
+        topK=TOP_K,
+        returnDistance=True,
+        returnMetadata=True,
+    )
+    logger.debug(f"S3 vectors lookup result: {response['vectors']}")
+    return response["vectors"]
+
+
+def _merge_examples(examples, new_examples):
+    """
+    Merge new examples in-place into the examples dict, avoiding duplicates.
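+    When the same key is returned for multiple pages, the lower (more similar) distance wins.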
+ + Args: + examples: Dict of existing examples + new_examples: List of new examples to be merged + """ + for new_example in new_examples: + key = new_example["key"] + new_distance = new_example.get("distance", 1.0) + + # update example + if examples.get(key): + existing_distance = examples[key].get("distance", 1.0) + examples[key]["distance"] = min(new_distance, existing_distance) + examples[key]["metadata"] = new_example.get("metadata") + # insert example + else: + examples[key] = { + "distance": new_distance, + "metadata": new_example.get("metadata"), + } + + +def _extract_metadata(metadata, distance): + """Create result object from S3 vectors metadata""" + # Result object attributes + attributes = { + "attributesPrompt": metadata.get("attributesPrompt"), + "classPrompt": metadata.get("classPrompt"), + "imageFiles": _get_image_files_from_s3_path(metadata.get("imagePath")), + "distance": distance, + } + + return attributes + + +def _get_image_files_from_s3_path(image_path): + """ + Get list of image files from an S3 path. + + Args: + image_path: Path to image file, directory, or S3 prefix + + Returns: + List of image file paths/URIs sorted by filename + """ + # Handle S3 URIs + if not image_path.startswith("s3://"): + raise ValueError(f"Invalid file path {image_path} - expecting S3 URI") + + # Check if it's a direct file or a prefix + if image_path.endswith( + (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp") + ): + # Direct S3 file + return [image_path] + else: + # S3 prefix - list all images + return s3.list_images_from_path(image_path) diff --git a/plugins/dynamic-few-shot-lambda/src/requirements.txt b/plugins/dynamic-few-shot-lambda/src/requirements.txt new file mode 100644 index 00000000..77b716ca --- /dev/null +++ b/plugins/dynamic-few-shot-lambda/src/requirements.txt @@ -0,0 +1 @@ +../../lib/idp_common_pkg[extraction,docs_service] # extraction module and document service with dependencies diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index 2c5158da..8646df10 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -46,6 +46,14 @@ Parameters: Type: String Default: "GENAIIDP-dynamic-few-shot" + GenAIIDPS3OutputBucketName: + Type: String + Description: "GenAIIDP S3OutputBucketName" + + GenAIIDPCustomerManagedEncryptionKeyArn: + Type: String + Description: "GenAIIDP CustomerManagedEncryptionKey ARN" + Conditions: HasPermissionsBoundary: !Not [!Equals [!Ref PermissionsBoundaryArn, ""]] @@ -69,7 +77,7 @@ Resources: Properties: FunctionName: !Ref LambdaFunctionName PermissionsBoundary: !If [HasPermissionsBoundary, !Ref PermissionsBoundaryArn, !Ref AWS::NoValue] - CodeUri: ./ + CodeUri: ./src Handler: GENAIIDP-dynamic-few-shot.lambda_handler Runtime: python3.12 Architectures: @@ -91,6 +99,8 @@ Resources: # Minimal permissions - only needs basic execution and logging Policies: - AWSLambdaBasicExecutionRole + - S3ReadPolicy: + BucketName: !Ref GenAIIDPS3OutputBucketName - S3ReadPolicy: BucketName: !Ref DynamicFewShotDatasetBucket - Statement: @@ -110,6 +120,15 @@ Resources: - s3vectors:QueryVectors Resource: - !Ref DynamicFewShotVectorIndex + - Effect: Allow + Action: + - kms:Encrypt + - kms:Decrypt + - kms:ReEncrypt* + - kms:GenerateDataKey* + - kms:DescribeKey + Resource: + - !Ref GenAIIDPCustomerManagedEncryptionKeyArn DynamicFewShotLogGroup: Type: AWS::Logs::LogGroup From 4cb63fce032e6b0c20bbb0a350ae33bd35b35049 Mon Sep 17 00:00:00 2001 From: Daniel Lorch 
Date: Thu, 11 Dec 2025 23:40:24 +0100 Subject: [PATCH 18/39] chore: configurable LOG_LEVEL --- plugins/dynamic-few-shot-lambda/template.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index 8646df10..168ea957 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -46,6 +46,10 @@ Parameters: Type: String Default: "GENAIIDP-dynamic-few-shot" + LogLevel: + Type: String + Default: INFO + GenAIIDPS3OutputBucketName: Type: String Description: "GenAIIDP S3OutputBucketName" @@ -87,7 +91,7 @@ Resources: Description: Demo Lambda function for GenAI IDP dynamic few-shot prompting Environment: Variables: - LOG_LEVEL: INFO + LOG_LEVEL: !Ref LogLevel S3VECTOR_BUCKET: !Ref VectorBucketName S3VECTOR_INDEX: !Ref VectorIndexName S3VECTOR_DIMENSIONS: !Ref VectorDimensions From 21c9855343cbcb932aeddab1bded2c6b140c5822 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 12 Dec 2025 17:27:15 +0100 Subject: [PATCH 19/39] feat: convert image_uri to image bytes from custom lambda invocation --- .../idp_common/extraction/service.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/lib/idp_common_pkg/idp_common/extraction/service.py b/lib/idp_common_pkg/idp_common/extraction/service.py index 3cd83a9f..ead4e0a1 100644 --- a/lib/idp_common_pkg/idp_common/extraction/service.py +++ b/lib/idp_common_pkg/idp_common/extraction/service.py @@ -433,6 +433,46 @@ def _make_json_serializable(self, obj: Any) -> Any: # Convert non-serializable objects to string representation return str(obj) + def _convert_image_uris_to_bytes_in_content( + self, content: list[dict[str, Any]] + ) -> list[dict[str, Any]]: + """ + Convert image URIs back to bytes in content array after Lambda processing. 
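+        The Lambda returns lightweight image_uri references; this method loads each
+        image from S3 and wraps it as a Bedrock image attachment.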
+ + Args: + content: Content array from Lambda that may contain image URIs + + Returns: + Content array with image bytes restored + """ + converted_content = [] + + for item in content: + if "image_uri" in item: + image_uri = item["image_uri"] + + # Load image content + if image_uri.startswith("s3://"): + # Direct S3 URI + logger.info(f"Retrieving image {image_uri}") + image_bytes = s3.get_binary_content(image_uri) + else: + raise ValueError( + f"Invalid file path {image_uri} - expecting S3 path" + ) + + converted_item = image.prepare_bedrock_image_attachment(image_bytes) + elif "image" in item: + # Keep existing image objects as-is + converted_item = item.copy() + else: + # Keep non-image items as-is + converted_item = item.copy() + + converted_content.append(converted_item) + + return converted_content + def _invoke_custom_prompt_lambda( self, lambda_arn: str, payload: dict[str, Any] ) -> dict[str, Any]: @@ -486,6 +526,13 @@ def _invoke_custom_prompt_lambda( logger.error(error_msg) raise Exception(error_msg) + # Convert image URIs to bytes in the response + result["task_prompt_content"] = ( + self._convert_image_uris_to_bytes_in_content( + result["task_prompt_content"] + ) + ) + return result except Exception as e: From f99467cc84726ae11c3f3c35b3815a3ffe1b2cda Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 12 Dec 2025 17:28:54 +0100 Subject: [PATCH 20/39] chore: use working bucket from GenAIIDP for dataset + adapt threshold --- plugins/dynamic-few-shot-lambda/README.md | 261 +++++++++++++------ plugins/dynamic-few-shot-lambda/template.yml | 30 +-- 2 files changed, 183 insertions(+), 108 deletions(-) diff --git a/plugins/dynamic-few-shot-lambda/README.md b/plugins/dynamic-few-shot-lambda/README.md index a400f61a..d38e5384 100644 --- a/plugins/dynamic-few-shot-lambda/README.md +++ b/plugins/dynamic-few-shot-lambda/README.md @@ -1,64 +1,77 @@ -# Dynamic-Few Shot Prompting - Complete Guide +# Dynamic Few-Shot Prompting Lambda - Complete Guide -This directory contains the **complete implementation and demonstration** of the dynamic-few shot prompting feature for GenAI IDP Accelerator. This feature enables users to dynamically retrieve few-shot examples using S3 Vectors similarity search to improve extraction accuracy for Pattern 2. +This directory contains the **complete implementation** of the dynamic few-shot prompting Lambda function for GenAI IDP Accelerator. This Lambda function integrates with Pattern 2 extraction as a custom prompt generator, dynamically retrieving similar examples using S3 Vectors similarity search to improve extraction accuracy. 
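+
+For example, a minimal wiring sketch (the ARN below is a placeholder; substitute your region, account, and deployed function name, and adapt the prompt to your classes):
+
+```yaml
+extraction:
+  custom_prompt_lambda_arn: "arn:aws:lambda:us-east-1:123456789012:function:GENAIIDP-dynamic-few-shot"
+  task_prompt: |
+    Extract the following attributes from this {DOCUMENT_CLASS} document:
+    {ATTRIBUTE_NAMES_AND_DESCRIPTIONS}
+    {FEW_SHOT_EXAMPLES}
+    Document text: {DOCUMENT_TEXT}
+```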
## ๐ŸŽฏ Overview -The dynamic-few shot prompting feature allows you to: +The dynamic few-shot prompting Lambda function allows you to: - **Dynamically retrieve similar examples** based on document content using vector similarity search -- **Provide few-shot examples** to improve extraction accuracy through example-based prompting +- **Automatically inject few-shot examples** into extraction prompts using the `{FEW_SHOT_EXAMPLES}` placeholder - **Leverage S3 Vectors** for efficient similarity search across large example datasets - **Integrate multimodal embeddings** using Amazon Nova models for image-based similarity -- **Customize example selection** based on document characteristics and business rules +- **Seamlessly integrate** with existing IDP extraction workflows as a custom prompt Lambda ## ๐Ÿ“ Files in This Directory -- **`GENAIIDP-dynamic-few-shot.py`** - Dynamic few-shot Lambda function with S3 Vectors lookup -- **`template.yml`** - CloudFormation SAM template to deploy the complete stack -- **`requirements.txt`** - Python dependencies for the Lambda function +- **`src/GENAIIDP-dynamic-few-shot.py`** - Dynamic few-shot Lambda function with S3 Vectors lookup +- **`src/requirements.txt`** - Python dependencies for the Lambda function +- **`template.yml`** - CloudFormation SAM template to deploy the Lambda function - **`README.md`** - This comprehensive documentation and guide ## ๐Ÿ—๏ธ Architecture ```mermaid flowchart TD - A[Document Processing] --> B{Dynamic-few shot configured?} - B -->|No| C[Use Default Extraction] - B -->|Yes| D[Invoke Dynamic-few shot Lambda] - - subgraph Lambda - D --> E[Receive Document Images] - E --> F[Generate Embeddings with Nova] - F --> G[Query S3 Vectors Index] - G --> H[Retrieve Similar Examples] - H --> I[Load Example Images from S3] - I --> J[Format Examples for Bedrock] + A[IDP Document Processing] --> B{Custom Prompt Lambda ARN configured?} + B -->|No| C[Use Default Task Prompt] + B -->|Yes| D[Invoke Dynamic Few-Shot Lambda] + + subgraph "Lambda Function: GENAIIDP-dynamic-few-shot" + D --> E[Receive IDP Context & Placeholders] + E --> F[Extract Document Images from DOCUMENT_IMAGE] + F --> G[Generate Nova Multimodal Embeddings] + G --> H[Query S3 Vectors Index] + H --> I[Filter by Distance Threshold] + I --> J[Merge & Deduplicate Results] + J --> K[Load Example Images from S3] + K --> L[Build Prompt Content Array] + L --> M[Replace FEW_SHOT_EXAMPLES Placeholder] end - J --> K[Use Examples in Extraction Prompt] - C --> L[Continue with Standard Extraction] - K --> L + M --> N[Return Modified Task Prompt Content] + C --> O[Continue with Bedrock Extraction] + N --> O - subgraph Input - M[Document Class] - N[Document Text] - O[Document Images] + subgraph "Input Payload" + P[config: IDP Configuration] + Q[prompt_placeholders: DOCUMENT_TEXT, DOCUMENT_CLASS, etc.] 
+ R[default_task_prompt_content: Original prompt] + S[serialized_document: Document metadata] end - subgraph Output - P[Example Attributes Prompts] - Q[Example Images] - R[Similarity Distances] + subgraph "Output Payload" + T[system_prompt: Unchanged] + U[task_prompt_content: Array with Prompt segments and Example images] end - D -.-> M - D -.-> N - D -.-> O + D -.-> P + D -.-> Q + D -.-> R + D -.-> S - J -.-> P - J -.-> Q - J -.-> R + N -.-> T + N -.-> U + + subgraph "S3 Vectors Infrastructure" + X[Vector Bucket: Encrypted storage] + Y[Vector Index: 3072-dim cosine similarity] + Z[Metadata: classPrompt, attributesPrompt, imagePath] + end + + H -.-> X + H -.-> Y + H -.-> Z ``` ## Quick Start @@ -88,7 +101,7 @@ aws cloudformation describe-stacks \ Use the [fewshot_dataset_import.ipynb](notebooks/fewshot_dataset_import.ipynb) notebook to import a dataset into S3 Vectors, or manually upload your example documents and metadata to the S3 bucket and vector index created by the stack. -### Step 4: Configure IDP to Use Dynamic-few shot +### Step 4: Configure IDP to Use Dynamic Few-Shot Add the Lambda ARN to your IDP extraction configuration: @@ -97,42 +110,81 @@ extraction: custom_prompt_lambda_arn: "arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot" ``` +**Important**: Your extraction task prompt must include the `{FEW_SHOT_EXAMPLES}` placeholder where you want the dynamic examples to be inserted. + +### Step 5: Run the Demo Notebook + +0. Run `notebooks/examples` steps 0, 1, 2 +1. Open `plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_custom_lambda.ipynb` +2. Run all cells to see the comparison + ## Lambda Interface ### Input Payload Structure + +The Lambda receives the full IDP context as a custom prompt Lambda: + ```json { - "class_label": "invoice", - "document_text": "Text or markdown from section 1 (pages 1-3)...", - "image_content": [ - "base64_encoded_image_1", - "base64_encoded_image_2" - ] + "config": { + "extraction": {...}, + "classes": [...], + ... + }, + "prompt_placeholders": { + "DOCUMENT_TEXT": "Full OCR text from all pages", + "DOCUMENT_CLASS": "invoice", + "ATTRIBUTE_NAMES_AND_DESCRIPTIONS": "LineItems: List of line items in the invoice...", + "DOCUMENT_IMAGE": ["s3://bucket/document/page1.jpg", "s3://bucket/document/page2.jpg"] + }, + "default_task_prompt_content": [ + {"text": "Resolved default task prompt..."}, + {"image_uri": "s3://..."}, // if images present + {"cachePoint": true} // if cache points present + ], + "serialized_document": { + "id": "document-123", + "input_bucket": "my-bucket", + "pages": {...}, + "sections": [...], + ... + } } ``` ### Output Payload Structure + +The Lambda returns modified prompt content with dynamic few-shot examples: + ```json -[ - { - "attributes_prompt": "Expected attributes are: invoice_number [Unique identifier], invoice_date [Invoice date], total_amount [Total amount]...", - "class_prompt": "This is an example of the class 'invoice'", - "distance": 0.122344521145, # lower is more similar - "image_content": ["", "", ...] 
- } -] +{ + "system_prompt": "Custom system prompt text", + "task_prompt_content": [ + {"text": "Extract the following attributes from this invoice document:\n\nLineItems: List of line items in the invoice...\n\n"}, + {"text": "expected attributes are:\n \"invoice_number\": \"INV-2024-001\",\n \"total_amount\": \"$1,250.00\""}, + {"image_uri": "s3://examples-bucket/invoices/example-001/page1.jpg"}, + {"text": "\n\n<>\n\nDocument content:\nINVOICE\nInvoice #: INV-2024-002..."} + ] +} ``` ## Core Functionality -### 1. Vector Similarity Search +### 1. Custom Prompt Integration + +The Lambda integrates with IDP's custom prompt system by: +- Receiving the full extraction context and configuration +- Processing the `{FEW_SHOT_EXAMPLES}` placeholder in task prompts +- Returning modified prompt content with dynamically retrieved examples + +### 2. Vector Similarity Search The Lambda uses Amazon Nova multimodal embeddings to find similar examples: ```python # Generate embedding from document image -embedding = bedrock.generate_embedding( - image_source=image_data, +embedding = bedrock_client.generate_embedding( + image_source=page_image, model_id=MODEL_ID, dimensions=S3VECTOR_DIMENSIONS, ) @@ -148,34 +200,36 @@ response = s3vectors.query_vectors( ) ``` -### 2. Example Merging and Deduplication +### 3. Example Merging and Deduplication Multiple document images are processed and results are merged to avoid duplicates: ```python -def merge_examples(combined_examples, new_examples): +def _merge_examples(examples, new_examples): """Merge examples, keeping the best similarity score for duplicates""" for new_example in new_examples: key = new_example["key"] - if combined_examples.get(key): - # Keep the better (lower) distance score - combined_examples[key]["distance"] = min( - new_example.get("distance"), - combined_examples[key]["distance"] - ) + new_distance = new_example.get("distance", 1.0) + + if examples.get(key): + existing_distance = examples[key].get("distance", 1.0) + examples[key]["distance"] = min(new_distance, existing_distance) ``` -### 3. Example Image Loading +### 4. Prompt Content Building -The Lambda loads example images from S3 paths stored in vector metadata: +The Lambda builds structured prompt content handling multiple placeholders: ```python -def get_image_files_from_s3_path(image_path: str) -> List[str]: - """Get list of image files from S3 path or prefix""" - if image_path.endswith((".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp")): - return [image_path] # Direct file - else: - return s3.list_images_from_path(image_path) # Directory/prefix +def _build_prompt_content(prompt_template, substitutions, image_content): + """ + Build prompt content array handling FEW_SHOT_EXAMPLES and DOCUMENT_IMAGE placeholders. + + Handles: + - {FEW_SHOT_EXAMPLES}: Inserts few-shot examples from S3 Vectors + - {DOCUMENT_IMAGE}: Inserts images at specific location + - Regular text placeholders: DOCUMENT_TEXT, DOCUMENT_CLASS, etc. + """ ``` ## Configuration @@ -188,7 +242,9 @@ The Lambda function uses these environment variables (set by the CloudFormation - `S3VECTOR_INDEX` - Name of the S3 Vectors index - `S3VECTOR_DIMENSIONS` - Embedding dimensions (e.g. `3072` for Nova Multimodal Embedding model) - `MODEL_ID` - Bedrock model ID for embeddings (e.g. 
`amazon.nova-2-multimodal-embeddings-v1:0`) -- `TOP_K` - Number of similar examples to retrieve +- `TOP_K` - Number of similar examples to retrieve (default: 3) +- `THRESHOLD` - Maximum distance threshold for filtering results (default: 0.5) +- `LOG_LEVEL` - Logging level (default: INFO) ### S3 Vectors Configuration @@ -208,16 +264,22 @@ Monitor the Lambda function logs: **Successful Operation:** ``` -Processing document ID: document-123 -Document class: invoice -Response contains 2 elements +=== DYNAMIC FEW-SHOT LAMBDA INVOKED === +=== EXTRACTION CONFIG === +Model: anthropic.claude-3-5-sonnet-20241022-v2:0 +=== HANDLE INPUT DOCUMENT === +=== OUTPUT ANALYSIS === +Output keys: ['system_prompt', 'task_prompt_content'] +Task prompt content items: 5 +=== DYNAMIC FEW-SHOT LAMBDA COMPLETED === ``` **Error Conditions:** ``` -No class_label found in event -No document_texts found in event or not in list format -Failed to load example images from s3://bucket/path: error +Failed to parse environment variables: KeyError('S3VECTOR_BUCKET') +Skipping example with empty attributesPrompt: example_key +Skipping example with distance 0.8 above threshold 0.5: example_key +Invalid file path /local/path - expecting S3 URI ``` ### Performance Monitoring @@ -331,22 +393,53 @@ aws cloudformation delete-stack --stack-name GENAIIDP-dynamic-few-shot-stack ### Configuration in IDP Stack -Add the dynamic-few shot Lambda ARN to your IDP configuration: +Add the dynamic few-shot Lambda ARN to your IDP extraction configuration: ```yaml -# In your IDP stack parameters or configuration extraction: - dynamic_few_shot_lambda_arn: "arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot" + custom_prompt_lambda_arn: "arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot" ``` +### Required Task Prompt Configuration + +**Critical**: Your extraction task prompt must include the `{FEW_SHOT_EXAMPLES}` placeholder where you want the dynamic examples to be inserted. The Lambda specifically looks for this placeholder and replaces it with retrieved examples. + ### Expected Behavior When configured: 1. IDP processes document and extracts images/text -2. Dynamic few-shot Lambda is invoked with document data -3. Lambda returns similar examples with prompts and images -4. IDP includes examples in extraction prompt to Bedrock -5. Bedrock uses examples to improve extraction accuracy +2. IDP invokes the dynamic few-shot Lambda with full extraction context +3. Lambda generates embeddings from document images using Amazon Nova +4. Lambda queries S3 Vectors to find similar examples +5. Lambda loads example images and metadata from S3 +6. Lambda builds modified prompt content with examples inserted at `{FEW_SHOT_EXAMPLES}` location +7. IDP uses the modified prompt content for Bedrock extraction +8. Bedrock uses the dynamic examples to improve extraction accuracy + +### Prompt Flow Example + +**Original Task Prompt:** +``` +Extract attributes from this invoice: +{ATTRIBUTE_NAMES_AND_DESCRIPTIONS} +{FEW_SHOT_EXAMPLES} +<> +Document: {DOCUMENT_TEXT} +``` + +**After Lambda Processing:** +``` +Extract attributes from this invoice: +invoice_number [Unique identifier]... + +expected attributes are: + "invoice_number": "INV-2024-001", + "total_amount": "$1,250.00" +[Example image content] + +<> +Document: INVOICE #INV-2024-002... 
+``` ## Next Steps diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index 168ea957..71f47f81 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -39,7 +39,7 @@ Parameters: Threshold: Type: Number - Default: 0.2 + Default: 0.5 Description: Filter results exceeding this similarity threshold (lower is more similar) LambdaFunctionName: @@ -54,6 +54,10 @@ Parameters: Type: String Description: "GenAIIDP S3OutputBucketName" + GenAIIDPS3WorkingBucketName: + Type: String + Description: "GenAIIDP WorkingBucket Name" + GenAIIDPCustomerManagedEncryptionKeyArn: Type: String Description: "GenAIIDP CustomerManagedEncryptionKey ARN" @@ -106,7 +110,7 @@ Resources: - S3ReadPolicy: BucketName: !Ref GenAIIDPS3OutputBucketName - S3ReadPolicy: - BucketName: !Ref DynamicFewShotDatasetBucket + BucketName: !Ref GenAIIDPS3WorkingBucketName - Statement: - Effect: Allow Action: cloudwatch:PutMetricData @@ -173,28 +177,6 @@ Resources: - "imagePath" VectorBucketArn: !Ref DynamicFewShotVectorBucket - DynamicFewShotDatasetBucket: - Type: AWS::S3::Bucket - DeletionPolicy: RetainExceptOnCreate - Metadata: - cfn_nag: - rules_to_suppress: - - id: W84 - reason: "Demo function - KMS CMK not required, but can be added by customer for production use cases" - # checkov:skip=CKV_AWS_158: "Demo function - KMS CMK not required, but can be added by customer for production use cases" - Properties: - BucketEncryption: - ServerSideEncryptionConfiguration: - - ServerSideEncryptionByDefault: - SSEAlgorithm: "AES256" - PublicAccessBlockConfiguration: - BlockPublicAcls: true - BlockPublicPolicy: true - IgnorePublicAcls: true - RestrictPublicBuckets: true - VersioningConfiguration: - Status: Enabled - Outputs: DynamicFewShotFunctionName: From 72c85f7107ebf610aa365b25a6f2a7250fb7a56a Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 12 Dec 2025 17:29:16 +0100 Subject: [PATCH 21/39] chore: remove FATURA2 dataset import --- .../notebooks/fewshot_dataset_import.ipynb | 487 ------------------ 1 file changed, 487 deletions(-) delete mode 100644 plugins/dynamic-few-shot-lambda/notebooks/fewshot_dataset_import.ipynb diff --git a/plugins/dynamic-few-shot-lambda/notebooks/fewshot_dataset_import.ipynb b/plugins/dynamic-few-shot-lambda/notebooks/fewshot_dataset_import.ipynb deleted file mode 100644 index dc56a646..00000000 --- a/plugins/dynamic-few-shot-lambda/notebooks/fewshot_dataset_import.ipynb +++ /dev/null @@ -1,487 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Few-shot Dataset Import to S3 Vector store\n", - "\n", - "This notebook demonstrates how to import the FATURA2 dataset into S3 Vectors for use with the examples-provider Lambda function.\n", - "\n", - "The FATURA2 dataset contains invoice documents that can be used as few-shot examples for document extraction tasks.\n", - "\n", - "## Process Overview:\n", - "\n", - "1. **Load FATURA2 Dataset** - Download and process the dataset\n", - "2. **Generate Embeddings** - Create multimodal embeddings using Amazon Nova\n", - "3. **Upload to S3 Vectors** - Store embeddings and metadata in S3 Vectors index\n", - "4. **Verify Import** - Test similarity search functionality\n", - "\n", - "> **Note**: This notebook requires AWS credentials with permissions for Bedrock, S3, and S3 Vectors services." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. 
Install Dependencies" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Let's make sure that modules are autoreloaded\n", - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "ROOTDIR=\"../..\"\n", - "# First uninstall existing package (to ensure we get the latest version)\n", - "%pip uninstall -y idp_common\n", - "\n", - "# Install the IDP common package with all components in development mode\n", - "%pip install -q -e \"{ROOTDIR}/lib/idp_common_pkg[dev, all]\"\n", - "\n", - "# Note: We can also install specific components like:\n", - "# %pip install -q -e \"{ROOTDIR}/lib/idp_common_pkg[ocr,classification,extraction,evaluation]\"\n", - "\n", - "# Check installed version\n", - "%pip show idp_common | grep -E \"Version|Location\"\n", - "\n", - "# Install required packages\n", - "%pip install -q pillow requests tqdm pandas\n", - "\n", - "# Optionally use a .env file for environment variables\n", - "try:\n", - " from dotenv import load_dotenv\n", - " load_dotenv() \n", - "except ImportError:\n", - " pass" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Import Libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import zipfile\n", - "import requests\n", - "from pathlib import Path\n", - "from typing import Dict, List, Any\n", - "from tqdm import tqdm\n", - "import pandas as pd\n", - "\n", - "import boto3\n", - "from PIL import Image\n", - "\n", - "# Import IDP common modules\n", - "from idp_common import bedrock\n", - "\n", - "print(\"Libraries imported successfully\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Configure S3 Vectors and Bedrock" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Configuration - Update these values based on your deployment of the 'notebooks/examples/dynamic-few-shot-lambda' stack\n", - "S3_BUCKET_FOR_IMAGES = \"\" # Stack output 'DynamicFewShotDatasetBucket'\n", - "S3_VECTORS_BUCKET = \"genaiidp-dynamic-few-shot\"\n", - "S3_VECTORS_INDEX = \"documents\"\n", - "\n", - "EMBEDDING_MODEL_ID = \"amazon.nova-2-multimodal-embeddings-v1:0\"\n", - "EMBEDDING_DIMENSIONS = 3072\n", - "\n", - "# Initialize clients\n", - "s3vectors_client = boto3.client('s3vectors')\n", - "s3_client = boto3.client('s3')\n", - "bedrock_client = bedrock.BedrockClient()\n", - "\n", - "print(f\"Configured for S3 Vectors bucket: {S3_VECTORS_BUCKET}\")\n", - "print(f\"Configured for S3 Vectors index: {S3_VECTORS_INDEX}\")\n", - "print(f\"Using embedding model: {EMBEDDING_MODEL_ID}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. 
Load FATURA2 Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Download and extract FATURA2 dataset from Zenodo\n", - "print(\"Downloading FATURA2 dataset...\")\n", - "\n", - "# Configuration for this dataset\n", - "IMAGE_VARIANT = 'colored_images'\n", - "ANNOTATION_VARIANT = 'Original_Format'\n", - "CLASS_LABEL = 'invoice'\n", - "\n", - "# Create datasets directory\n", - "datasets_dir = Path('datasets')\n", - "datasets_dir.mkdir(exist_ok=True)\n", - "\n", - "# Download the zip file\n", - "zip_url = 'https://zenodo.org/records/10371464/files/FATURA2.zip?download=1'\n", - "zip_path = datasets_dir / 'FATURA2.zip'\n", - "\n", - "if not zip_path.exists():\n", - " response = requests.get(zip_url, stream=True)\n", - " response.raise_for_status()\n", - " \n", - " with open(zip_path, 'wb') as f:\n", - " for chunk in tqdm(response.iter_content(chunk_size=8192), desc='Downloading'):\n", - " f.write(chunk)\n", - " print(f\"Downloaded {zip_path}\")\n", - "else:\n", - " print(f\"Using existing {zip_path}\")\n", - "\n", - "# Extract the zip file\n", - "extract_dir = datasets_dir / 'invoices_dataset_final'\n", - "if not extract_dir.exists():\n", - " with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n", - " zip_ref.extractall(datasets_dir)\n", - " print(f\"Extracted to {extract_dir}\")\n", - "else:\n", - " print(f\"Using existing {extract_dir}\")\n", - "\n", - "colored_images = extract_dir / IMAGE_VARIANT\n", - "\n", - "# Load images from extracted directory\n", - "image_files = list(colored_images.glob('**/*.jpg'))\n", - "print(f\"Found {len(image_files)} {IMAGE_VARIANT} files\")\n", - "\n", - "# Show sample\n", - "if image_files:\n", - " sample_image = Image.open(image_files[0])\n", - " print(f\"Sample image: {image_files[0].name}\")\n", - " print(f\"Image size: {sample_image.size}\")\n", - "\n", - "print(f\"Image variant: {IMAGE_VARIANT}\")\n", - "print(f\"Annotation variant: {ANNOTATION_VARIANT}\")\n", - "print(f\"Class label: {CLASS_LABEL}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. 
Process Dataset and Generate Embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "def upload_image_to_s3(image_bytes: bytes, s3_key: str) -> str:\n", - " \"\"\"Upload image to S3 and return S3 URI.\"\"\"\n", - " s3_client.put_object(\n", - " Bucket=S3_BUCKET_FOR_IMAGES,\n", - " Key=s3_key,\n", - " Body=image_bytes,\n", - " ContentType='image/jpeg'\n", - " )\n", - " return f\"s3://{S3_BUCKET_FOR_IMAGES}/{s3_key}\"\n", - "\n", - "def load_split(extract_dir, split_name):\n", - " csv_path = extract_dir / (split_name + \".csv\")\n", - " return pd.read_csv(csv_path)\n", - "\n", - "def read_annotation(extract_dir, annot_path):\n", - " json_path = extract_dir / \"Annotations\" / ANNOTATION_VARIANT / annot_path\n", - " with open(json_path, \"r\") as f:\n", - " annotation = f.read()\n", - " return json.loads(annotation)\n", - "\n", - "def load_image(extract_dir, img_path):\n", - " image_path = extract_dir / IMAGE_VARIANT / img_path\n", - " with open(image_path, \"rb\") as f:\n", - " image_content = f.read()\n", - " return image_content\n", - "\n", - "def map_labels(annotations):\n", - " labels = {}\n", - " labels['invoice_number'] = annotations.get(\"NUMBER\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['invoice_date'] = annotations.get(\"DATE\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['due_date'] = annotations.get(\"DUE_DATE\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['vendor_name'] = annotations.get(\"SELLER_NAME\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['vendor_address'] = annotations.get(\"SELLER_ADDRESS\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " BUYER = annotations.get(\"BUYER\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['customer_name'] = BUYER[0] if len(BUYER) > 0 else []\n", - " labels['customer_address'] = BUYER[1:] if len(BUYER) > 1 else []\n", - " labels['items'] = \"null\"\n", - " labels['quantities'] = \"null\"\n", - " labels['unit_prices'] = \"null\"\n", - " labels['subtotal'] = annotations.get(\"SUB_TOTAL\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['tax'] = annotations.get(\"TAX\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['total_amount'] = annotations.get(\"TOTAL\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['payment_terms'] = annotations.get(\"NOTE\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " labels['po_number'] = annotations.get(\"GSTIN_BUYER\", {}).get(\"text\", \"null\").split(\"\\n\")\n", - " return labels\n", - "\n", - "def get_attributes_prompt(labels):\n", - " attributes_prompt = f\"\"\"expected attributes are:\n", - " \"invoice_number\": {\", \".join(labels['invoice_number'])}\n", - " \"invoice_date\": {\", \".join(labels['invoice_date'])}\n", - " \"due_date\": {\", \".join(labels['due_date'])}\n", - " \"vendor_name\": {\", \".join(labels['vendor_name'])}\n", - " \"vendor_address\": {\", \".join(labels['vendor_address'])}\n", - " \"customer_name\": {labels['customer_name']}\n", - " \"customer_address\": {\", \".join(labels['customer_address'])}\n", - " \"items\": {labels['items']}\n", - " \"quantities\": {labels['quantities']}\n", - " \"unit_prices\": {labels['unit_prices']}\n", - " \"subtotal\": {\", \".join(labels['subtotal'])}\n", - " \"tax\": {\", \".join(labels['tax'])}\n", - " \"total_amount\": {\", \".join(labels['total_amount'])}\n", - " \"payment_terms\": {\", \".join(labels['payment_terms'])}\n", - " \"po_number\": {\", 
\".join(labels['po_number'])}\n", - " \"\"\".strip()\n", - " return attributes_prompt\n", - "\n", - "def create_metadata(annotations: Dict, s3_image_uri: str) -> Dict:\n", - " \"\"\"Create metadata for S3 Vectors entry.\"\"\"\n", - " class_prompt = f\"This is an example of the class '{CLASS_LABEL}'\"\n", - "\n", - " labels = map_labels(annotations)\n", - " attributes_prompt = get_attributes_prompt(labels)\n", - "\n", - " return {\n", - " \"classLabel\": CLASS_LABEL,\n", - " \"classPrompt\": class_prompt,\n", - " \"attributesPrompt\": attributes_prompt,\n", - " \"imagePath\": s3_image_uri,\n", - " }\n", - "\n", - "print(\"Helper functions defined\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Import Dataset to S3 Vectors" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Process a subset of the dataset (adjust as needed)\n", - "MAX_SAMPLES = 100 # Adjust this number based on your needs\n", - "BATCH_SIZE = 10 # Adjust this number based on your needs\n", - "\n", - "dataset_split = load_split(extract_dir, \"strat1_train\")\n", - "samples_to_process = min(MAX_SAMPLES, len(dataset_split))\n", - "\n", - "print(f\"Processing {samples_to_process} samples from FATURA2 dataset...\")\n", - "\n", - "vectors_to_upload = []\n", - "failed_samples = []\n", - "\n", - "for i in tqdm(range(samples_to_process), desc=\"Processing samples\"):\n", - " try:\n", - " df_image = dataset_split.iloc[i]\n", - "\n", - " # Load annotations\n", - " annotations = read_annotation(extract_dir, df_image[\"annot_path\"])\n", - " \n", - " # Load image\n", - " image_bytes = load_image(extract_dir, df_image[\"img_path\"])\n", - "\n", - " # Upload image to S3\n", - " s3_key = f\"fatura2/{IMAGE_VARIANT}/{df_image['img_path']}\"\n", - " s3_image_uri = upload_image_to_s3(image_bytes, s3_key)\n", - " \n", - " # Generate embedding\n", - " embedding = bedrock_client.generate_embedding(\n", - " image_source=image_bytes,\n", - " model_id=EMBEDDING_MODEL_ID,\n", - " dimensions=EMBEDDING_DIMENSIONS\n", - " )\n", - " \n", - " # Create metadata\n", - " metadata = create_metadata(annotations, s3_image_uri)\n", - "\n", - " # Prepare vector for upload\n", - " vector_entry = {\n", - " \"key\": f\"fatura2_sample_{i:06d}\",\n", - " \"data\": {\"float32\": embedding},\n", - " \"metadata\": metadata\n", - " }\n", - "\n", - " vectors_to_upload.append(vector_entry)\n", - " \n", - " # Upload in batches to avoid memory issues\n", - " if len(vectors_to_upload) >= BATCH_SIZE: # Batch size\n", - " print(f\"\\nUploading batch of {len(vectors_to_upload)} vectors...\")\n", - " response = s3vectors_client.put_vectors(\n", - " vectorBucketName=S3_VECTORS_BUCKET,\n", - " indexName=S3_VECTORS_INDEX,\n", - " vectors=vectors_to_upload\n", - " )\n", - " print(f\"Batch upload response: {response.get('ResponseMetadata', {}).get('HTTPStatusCode')}\")\n", - " vectors_to_upload = [] # Clear batch\n", - " \n", - " except Exception as e:\n", - " print(f\"\\nFailed to process sample {i}: {e}\")\n", - " failed_samples.append(i)\n", - " continue\n", - "\n", - "# Upload remaining vectors\n", - "if vectors_to_upload:\n", - " print(f\"\\nUploading final batch of {len(vectors_to_upload)} vectors...\")\n", - " response = s3vectors_client.put_vectors(\n", - " vectorBucketName=S3_VECTORS_BUCKET,\n", - " indexName=S3_VECTORS_INDEX,\n", - " vectors=vectors_to_upload\n", - " )\n", - " print(f\"Final batch upload response: {response.get('ResponseMetadata', 
{}).get('HTTPStatusCode')}\")\n", - "\n", - "print(f\"\\nImport completed!\")\n", - "print(f\"Successfully processed: {samples_to_process - len(failed_samples)} samples\")\n", - "print(f\"Failed samples: {len(failed_samples)}\")\n", - "if failed_samples:\n", - " print(f\"Failed sample indices: {failed_samples[:10]}...\") # Show first 10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 7. Verify Import with Similarity Search" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Test similarity search with a sample from the dataset\n", - "test_split = load_split(extract_dir, \"strat1_test\")\n", - "\n", - "test_sample_index = 0\n", - "df_image = test_split.iloc[test_sample_index]\n", - "\n", - "test_image_bytes = load_image(extract_dir, df_image[\"img_path\"])\n", - "\n", - "print(f\"Testing similarity search with sample {extract_dir / IMAGE_VARIANT / df_image['img_path']}...\")\n", - "\n", - "# Generate embedding for test image\n", - "test_embedding = bedrock_client.generate_embedding(\n", - " image_source=test_image_bytes,\n", - " model_id=EMBEDDING_MODEL_ID,\n", - " dimensions=EMBEDDING_DIMENSIONS\n", - ")\n", - "\n", - "# Query S3 Vectors for similar examples\n", - "response = s3vectors_client.query_vectors(\n", - " vectorBucketName=S3_VECTORS_BUCKET,\n", - " indexName=S3_VECTORS_INDEX,\n", - " queryVector={\"float32\": test_embedding},\n", - " topK=5,\n", - " returnDistance=True,\n", - " returnMetadata=True\n", - ")\n", - "\n", - "print(f\"\\nFound {len(response['vectors'])} similar examples:\")\n", - "for i, vector in enumerate(response['vectors']):\n", - " distance = vector.get('distance', 'N/A')\n", - " key = vector.get('key', 'N/A')\n", - " metadata = vector.get('metadata', {})\n", - " class_label = metadata.get('classLabel', 'N/A')\n", - " class_prompt = metadata.get('classPrompt', 'N/A')\n", - " attributes_prompt = metadata.get('attributesPrompt', 'N/A')\n", - " image_path = metadata.get('imagePath', 'N/A')\n", - " \n", - " print(f\" {i+1}. Key: {key}\")\n", - " print(f\" Distance: {distance:.4f}\")\n", - " print(f\" Class Label: {image_path}\")\n", - " print(f\" Class Prompt: {class_prompt}\")\n", - " print(f\" Attributes Prompt: {attributes_prompt}\")\n", - " print(f\" Image Path: {image_path}\")\n", - " print()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 8. Summary and Next Steps" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"=== Few-shot Dataset Import Summary ===\")\n", - "print(f\"โœ… Dataset: FATURA2 (Invoice documents)\")\n", - "print(f\"โœ… Samples processed: {samples_to_process - len(failed_samples)}\")\n", - "print(f\"โœ… S3 Vectors Bucket: {S3_VECTORS_BUCKET}\")\n", - "print(f\"โœ… S3 Vectors Index: {S3_VECTORS_INDEX}\")\n", - "print(f\"โœ… Images stored in: s3://{S3_BUCKET_FOR_IMAGES}/fatura2/{IMAGE_VARIANT}/\")\n", - "print(f\"โœ… Embedding Model: {EMBEDDING_MODEL_ID}\")\n", - "print(f\"โœ… Similarity search verified\")\n", - "\n", - "print(\"\\n=== Next Steps ===\")\n", - "print(\"1. Upload your own datasets into S3 Vectors\")\n", - "print(\"2. Configure your IDP extraction to use the examples provider Lambda ARN\")\n", - "print(\"3. 
Test document processing with few-shot examples!\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.11"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}

From 3c50242b5dd340f0c66085c5509324755535e056 Mon Sep 17 00:00:00 2001
From: Daniel Lorch 
Date: Fri, 12 Dec 2025 17:30:14 +0100
Subject: [PATCH 22/39] feat: add fcc_invoices (REALKIE) dataset import

---
 .../fcc_invoices_dataset_import.ipynb         | 761 ++++++++++++++++++
 1 file changed, 761 insertions(+)
 create mode 100644 plugins/dynamic-few-shot-lambda/notebooks/fcc_invoices_dataset_import.ipynb

diff --git a/plugins/dynamic-few-shot-lambda/notebooks/fcc_invoices_dataset_import.ipynb b/plugins/dynamic-few-shot-lambda/notebooks/fcc_invoices_dataset_import.ipynb
new file mode 100644
index 00000000..2dc1fdce
--- /dev/null
+++ b/plugins/dynamic-few-shot-lambda/notebooks/fcc_invoices_dataset_import.ipynb
@@ -0,0 +1,761 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# FCC Invoices Dataset Import to S3 Vector store\n",
+    "\n",
+    "This notebook demonstrates how to import the FCC invoices (REALKIE) dataset into S3 Vectors for use with the dynamic few-shot Lambda function.\n",
+    "\n",
+    "The FCC invoices dataset contains invoice documents that can be used as few-shot examples for document extraction tasks.\n",
+    "\n",
+    "## Process Overview:\n",
+    "\n",
+    "1. **Load FCC Invoices Dataset** - Sync and load the dataset using load_dataset()\n",
+    "2. **Generate Embeddings** - Create multimodal embeddings using Amazon Nova\n",
+    "3. **Upload to S3 Vectors** - Store embeddings and metadata in S3 Vectors index\n",
+    "4. **Verify Import** - Test similarity search functionality\n",
+    "\n",
+    "> **Note**: This notebook requires AWS credentials with permissions for Bedrock, S3, and S3 Vectors services."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Install Dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's make sure that modules are autoreloaded\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "ROOTDIR=\"../../../\"\n",
+    "# First uninstall existing package (to ensure we get the latest version)\n",
+    "%pip uninstall -y idp_common\n",
+    "\n",
+    "# Install the IDP common package with all components in development mode\n",
+    "%pip install -q -e \"{ROOTDIR}/lib/idp_common_pkg[dev, all]\"\n",
+    "\n",
+    "# Note: We can also install specific components like:\n",
+    "# %pip install -q -e \"{ROOTDIR}/lib/idp_common_pkg[ocr,classification,extraction,evaluation]\"\n",
+    "\n",
+    "# Check installed version\n",
+    "%pip show idp_common | grep -E \"Version|Location\"\n",
+    "\n",
+    "# Install required packages\n",
+    "%pip install -q pillow tqdm pandas datasets matplotlib\n",
+    "\n",
+    "# Optionally use a .env file for environment variables\n",
+    "try:\n",
+    "    from dotenv import load_dotenv\n",
+    "    load_dotenv() \n",
+    "except ImportError:\n",
+    "    pass"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. 
Import Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import subprocess\n", + "from pathlib import Path\n", + "from typing import Dict, List, Any\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "import io\n", + "\n", + "import boto3\n", + "from datasets import load_dataset\n", + "\n", + "# Import IDP common modules\n", + "from idp_common import bedrock\n", + "\n", + "print(\"Libraries imported successfully\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Configure S3 Vectors and Bedrock" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration - Update these values from the IDP stack in CloudFormation Resources tab\n", + "GENAIIDP_S3_WORKING_BUCKET = \"\" # From IDP stack Resources tab\n", + "\n", + "S3_VECTORS_BUCKET = \"genaiidp-dynamic-few-shot\"\n", + "S3_VECTORS_INDEX = \"documents\"\n", + "EMBEDDING_MODEL_ID = \"amazon.nova-2-multimodal-embeddings-v1:0\"\n", + "EMBEDDING_DIMENSIONS = 3072\n", + "\n", + "# Initialize clients\n", + "s3vectors_client = boto3.client('s3vectors')\n", + "s3_client = boto3.client('s3')\n", + "bedrock_client = bedrock.BedrockClient()\n", + "\n", + "print(f\"Configured for dataset S3 Bucket: {GENAIIDP_S3_WORKING_BUCKET}\")\n", + "print(f\"Configured for S3 Vectors bucket: {S3_VECTORS_BUCKET}\")\n", + "print(f\"Configured for S3 Vectors index: {S3_VECTORS_INDEX}\")\n", + "print(f\"Using embedding model: {EMBEDDING_MODEL_ID}\")\n", + "print(f\"Using embedding dimensions: {EMBEDDING_DIMENSIONS}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Load FCC Invoices Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sync FCC invoices dataset from S3\n", + "print(\"Syncing FCC invoices dataset from S3...\")\n", + "\n", + "# Configuration for this dataset\n", + "CLASS_LABEL = 'Invoice'\n", + "\n", + "# Create datasets directory\n", + "dataset_root_dir = Path('../datasets')\n", + "dataset_root_dir.mkdir(exist_ok=True)\n", + "\n", + "# Dataset directory\n", + "dataset_dir = dataset_root_dir / 'fcc_invoices'\n", + "\n", + "# Sync dataset from S3 using AWS CLI with Wasabi endpoint\n", + "if not dataset_dir.exists() or not any(dataset_dir.iterdir()):\n", + " print(\"Syncing dataset from S3...\")\n", + " sync_command = [\n", + " 'aws', 's3', 'sync',\n", + " 's3://project-fruitfly/fcc_invoices',\n", + " str(dataset_dir),\n", + " '--endpoint-url=https://s3.us-east-2.wasabisys.com',\n", + " '--no-sign-request'\n", + " ]\n", + " \n", + " try:\n", + " result = subprocess.run(sync_command, capture_output=True, text=True, check=True)\n", + " print(f\"Dataset synced successfully to {dataset_dir}\")\n", + " print(f\"Sync output: {result.stdout}\")\n", + " except subprocess.CalledProcessError as e:\n", + " print(f\"Error syncing dataset: {e}\")\n", + " print(f\"Error output: {e.stderr}\")\n", + " raise\n", + "else:\n", + " print(f\"Using existing dataset at {dataset_dir}\")\n", + "\n", + "# Load the training dataset using load_dataset\n", + "print(\"Loading training dataset...\")\n", + "try:\n", + " # Load dataset from local directory\n", + " dataset = load_dataset('csv', data_dir=str(dataset_dir), split='train')\n", + " print(f\"Loaded dataset with {len(dataset)} samples\")\n", + " \n", + " # Show sample information\n", + " if len(dataset) > 0:\n", + " 
sample = dataset[0]\n", + " print(f\"Sample keys: {list(sample.keys())}\")\n", + " if 'image' in sample:\n", + " print(f\"Sample image size: {sample['image'].size}\")\n", + " \n", + "except Exception as e:\n", + " print(f\"Error loading dataset: {e}\")\n", + " # Fallback: list files in directory\n", + " image_files = list(dataset_dir.glob('**/*.jpg')) + list(dataset_dir.glob('**/*.png'))\n", + " print(f\"Found {len(image_files)} image files in directory\")\n", + " if image_files:\n", + " print(f\"Sample image: {image_files[0].name}\")\n", + " print(f\"Image file size: {image_files[0].stat().st_size} bytes\")\n", + "\n", + "print(f\"Class label: {CLASS_LABEL}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Process Dataset and Generate Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def upload_image_to_s3(image_bytes: bytes, s3_key: str) -> str:\n", + " \"\"\"Upload image to S3 and return S3 URI.\"\"\"\n", + " s3_client.put_object(\n", + " Bucket=GENAIIDP_S3_WORKING_BUCKET,\n", + " Key=s3_key,\n", + " Body=image_bytes,\n", + " ContentType='image/jpeg'\n", + " )\n", + " return f\"s3://{GENAIIDP_S3_WORKING_BUCKET}/{s3_key}\"\n", + "\n", + "def load_csv_labels():\n", + " \"\"\"Load the CSV file with labels and metadata.\"\"\"\n", + " csv_path = dataset_dir / 'train.csv'\n", + " if csv_path.exists():\n", + " try:\n", + " df = pd.read_csv(csv_path)\n", + " print(f\"Loaded CSV with {len(df)} rows\")\n", + " return df\n", + " except Exception as e:\n", + " print(f\"Error loading CSV: {e}\")\n", + " return None\n", + " else:\n", + " print(f\"CSV file not found at {csv_path}\")\n", + " return None\n", + "\n", + "def match_image_to_csv_row(image_path: str, csv_df: pd.DataFrame):\n", + " \"\"\"Match an image path to the corresponding CSV row.\"\"\"\n", + " if csv_df is None:\n", + " return None\n", + " \n", + " # Extract the image filename from the path\n", + " image_name = Path(image_path).name\n", + " \n", + " # Look for matching rows in the CSV\n", + " for idx, row in csv_df.iterrows():\n", + " image_files_str = row.get('image_files', '')\n", + " if image_name in image_files_str:\n", + " return row\n", + " \n", + " return None\n", + "\n", + "def get_image_bytes_from_file(image_path):\n", + " \"\"\"Read image file directly as bytes.\"\"\"\n", + " with open(image_path, 'rb') as f:\n", + " return f.read()\n", + "\n", + "def create_sample_attributes_prompt() -> str:\n", + " \"\"\"Create a sample attributes prompt for FCC invoices based on the actual schema.\"\"\"\n", + " # Updated to match the actual FCC invoices dataset structure and expected JSON schema\n", + " attributes_prompt = \"\"\"expected attributes are:\n", + " \"Agency\": \"Great American Media\",\n", + " \"Advertiser\": \"ISS/HOUSE MAJ PAC\", \n", + " \"GrossTotal\": 94700.00,\n", + " \"PaymentTerms\": \"Cash In Advance\",\n", + " \"AgencyCommission\": 14205.00,\n", + " \"NetAmountDue\": 80495.00,\n", + " \"LineItems\": [\n", + " {\n", + " \"LineItemDescription\": \"TODAY IN FLORIDA @9PM\",\n", + " \"LineItemStartDate\": \"10/18/2016\", \n", + " \"LineItemEndDate\": null,\n", + " \"LineItemDays\": [\"T\"],\n", + " \"LineItemRate\": 500.00\n", + " },\n", + " {\n", + " \"LineItemDescription\": \"CH 7 NEWS @ 10PM\",\n", + " \"LineItemStartDate\": \"10/18/2016\",\n", + " \"LineItemEndDate\": null, \n", + " \"LineItemDays\": [\"T\"],\n", + " \"LineItemRate\": 3200.00\n", + " }\n", + " ]\n", + " \"\"\".strip()\n", + " return 
attributes_prompt\n",
+    "\n",
+    "def parse_ground_truth_labels(labels_json_str: str) -> Dict:\n",
+    "    \"\"\"Parse ground truth labels from the dataset and convert to expected format.\"\"\"\n",
+    "    import json\n",
+    "    import re\n",
+    "    \n",
+    "    try:\n",
+    "        labels = json.loads(labels_json_str)\n",
+    "    except (json.JSONDecodeError, TypeError):\n",
+    "        return None\n",
+    "    \n",
+    "    # Initialize the result structure\n",
+    "    result = {\n",
+    "        \"Agency\": None,\n",
+    "        \"Advertiser\": None,\n",
+    "        \"GrossTotal\": None,\n",
+    "        \"PaymentTerms\": None,\n",
+    "        \"AgencyCommission\": None,\n",
+    "        \"NetAmountDue\": None,\n",
+    "        \"LineItems\": []\n",
+    "    }\n",
+    "    \n",
+    "    # Group line items by their properties\n",
+    "    line_items = {}\n",
+    "    \n",
+    "    for label in labels:\n",
+    "        label_type = label.get('label', '')\n",
+    "        text = label.get('text', '')\n",
+    "        \n",
+    "        # Map top-level fields\n",
+    "        if label_type == 'Agency':\n",
+    "            result['Agency'] = text\n",
+    "        elif label_type == 'Advertiser':\n",
+    "            result['Advertiser'] = text\n",
+    "        elif label_type == 'Gross Total':\n",
+    "            try:\n",
+    "                result['GrossTotal'] = float(text.replace(',', '').replace('$', ''))\n",
+    "            except ValueError:\n",
+    "                result['GrossTotal'] = text\n",
+    "        elif label_type == 'Net Amount Due':\n",
+    "            try:\n",
+    "                result['NetAmountDue'] = float(text.replace(',', '').replace('$', ''))\n",
+    "            except ValueError:\n",
+    "                result['NetAmountDue'] = text\n",
+    "        elif label_type == 'Payment Terms':\n",
+    "            result['PaymentTerms'] = text\n",
+    "        elif label_type == 'Agency Commission':\n",
+    "            try:\n",
+    "                result['AgencyCommission'] = float(text.replace(',', '').replace('$', ''))\n",
+    "            except ValueError:\n",
+    "                result['AgencyCommission'] = text\n",
+    "        \n",
+    "        # Handle line items (group by position or create separate items)\n",
+    "        elif label_type.startswith('Line Item - '):\n",
+    "            field_name = label_type.replace('Line Item - ', '')\n",
+    "            start_pos = label.get('start', 0)\n",
+    "            \n",
+    "            # Use start position to group related line item fields\n",
+    "            # Find the closest line item group\n",
+    "            closest_key = None\n",
+    "            min_distance = float('inf')\n",
+    "            \n",
+    "            for key in line_items.keys():\n",
+    "                distance = abs(start_pos - key)\n",
+    "                if distance < min_distance and distance < 1000: # Within reasonable range\n",
+    "                    min_distance = distance\n",
+    "                    closest_key = key\n",
+    "            \n",
+    "            if closest_key is None:\n",
+    "                closest_key = start_pos\n",
+    "                line_items[closest_key] = {}\n",
+    "            \n",
+    "            # Map field names to expected schema\n",
+    "            if field_name == 'Description':\n",
+    "                line_items[closest_key]['LineItemDescription'] = text\n",
+    "            elif field_name == 'Start Date':\n",
+    "                line_items[closest_key]['LineItemStartDate'] = text\n",
+    "            elif field_name == 'End Date':\n",
+    "                line_items[closest_key]['LineItemEndDate'] = text if text else None\n",
+    "            elif field_name == 'Rate':\n",
+    "                try:\n",
+    "                    line_items[closest_key]['LineItemRate'] = float(text.replace(',', '').replace('$', ''))\n",
+    "                except ValueError:\n",
+    "                    line_items[closest_key]['LineItemRate'] = text\n",
+    "            elif field_name == 'Days':\n",
+    "                # Convert day codes to day names\n",
+    "                # Tokenize the string rather than iterating characters, so that\n",
+    "                # two-character codes like 'Th' and 'Su' are matched correctly\n",
+    "                day_mapping = {\n",
+    "                    'M': 'M', 'T': 'T', 'W': 'W', 'Th': 'Th', 'F': 'F', 'S': 'S', 'Su': 'Su',\n",
+    "                    '1': 'M', '2': 'T', '3': 'W', '4': 'Th', '5': 'F', '6': 'S', '7': 'Su'\n",
+    "                }\n",
+    "                days = []\n",
+    "                for token in re.findall(r'Th|Su|[MTWFS1-7]', text):\n",
+    "                    mapped_day = day_mapping[token]\n",
+    "                    if mapped_day not in days:\n",
+    "                        days.append(mapped_day)\n",
+    "                
line_items[closest_key]['LineItemDays'] = days\n", + " \n", + " # Convert line items dict to list\n", + " result['LineItems'] = list(line_items.values())\n", + " \n", + " return result\n", + "\n", + "def create_metadata(s3_image_uri: str, sample_data: Dict = None) -> Dict:\n", + " \"\"\"Create metadata for S3 Vectors entry.\"\"\"\n", + " class_prompt = f\"This is an example of the class '{CLASS_LABEL}'\"\n", + " \n", + " # If we have actual sample data with labels, use it to create a more accurate attributes prompt\n", + " if sample_data and 'labels' in sample_data:\n", + " parsed_labels = parse_ground_truth_labels(sample_data['labels'])\n", + " if parsed_labels:\n", + " attributes_prompt = f\"expected attributes are: {json.dumps(parsed_labels, indent=2)}\"\n", + " else:\n", + " attributes_prompt = create_sample_attributes_prompt()\n", + " else:\n", + " attributes_prompt = create_sample_attributes_prompt()\n", + "\n", + " return {\n", + " \"classLabel\": CLASS_LABEL,\n", + " \"classPrompt\": class_prompt,\n", + " \"attributesPrompt\": attributes_prompt,\n", + " \"imagePath\": s3_image_uri,\n", + " }\n", + "\n", + "print(\"Helper functions defined\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Import Dataset to S3 Vectors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Process a subset of the dataset (adjust as needed)\n", + "MAX_SAMPLES = 250 # Adjust this number based on your needs\n", + "BATCH_SIZE = 15 # Adjust this number based on your needs\n", + "\n", + "# Load the CSV labels (this contains the image_files information)\n", + "csv_df = load_csv_labels()\n", + "if csv_df is None:\n", + " print(\"Failed to load CSV data. Exiting.\")\n", + " raise Exception(\"CSV loading failed\")\n", + "\n", + "samples_to_process = min(MAX_SAMPLES, len(csv_df))\n", + "print(f\"Processing {samples_to_process} samples from FCC invoices CSV data...\")\n", + "\n", + "vectors_to_upload = []\n", + "failed_samples = []\n", + "\n", + "for i in tqdm(range(samples_to_process), desc=\"Processing samples\"):\n", + " try:\n", + " csv_row = csv_df.iloc[i]\n", + " \n", + " # Get image files from the CSV row\n", + " image_files_str = csv_row.get('image_files', '')\n", + " if not image_files_str:\n", + " print(f\"No image files found for sample {i}\")\n", + " failed_samples.append(i)\n", + " continue\n", + " \n", + " # Parse the image files array (it's stored as a JSON string)\n", + " import json\n", + " try:\n", + " image_files = json.loads(image_files_str)\n", + " except json.JSONDecodeError:\n", + " print(f\"Failed to parse image_files for sample {i}: {image_files_str}\")\n", + " failed_samples.append(i)\n", + " continue\n", + " \n", + " # Use the first image file (or you could process all images)\n", + " if not image_files:\n", + " print(f\"Empty image_files array for sample {i}\")\n", + " failed_samples.append(i)\n", + " continue\n", + " \n", + " # Load the first image file\n", + " image_file_path = image_files[0]\n", + " full_image_path = dataset_root_dir / image_file_path\n", + " \n", + " if not full_image_path.exists():\n", + " print(f\"Image file not found: {full_image_path}\")\n", + " failed_samples.append(i)\n", + " continue\n", + " \n", + " # Load image file as bytes\n", + " image_bytes = get_image_bytes_from_file(full_image_path)\n", + "\n", + " # Upload image to S3\n", + " s3_key = f\"fcc_invoices/sample_{i:06d}.jpg\"\n", + " s3_image_uri = upload_image_to_s3(image_bytes, s3_key)\n", + " \n", + " # 
Generate embedding\n", + " embedding = bedrock_client.generate_embedding(\n", + " image_source=image_bytes,\n", + " model_id=EMBEDDING_MODEL_ID,\n", + " dimensions=EMBEDDING_DIMENSIONS\n", + " )\n", + " \n", + " # Create metadata using the CSV row data\n", + " sample_data = {'labels': csv_row.get('labels')}\n", + " metadata = create_metadata(s3_image_uri, sample_data)\n", + "\n", + " # Prepare vector for upload\n", + " vector_entry = {\n", + " \"key\": f\"fcc_invoices_sample_{i:06d}\",\n", + " \"data\": {\"float32\": embedding},\n", + " \"metadata\": metadata\n", + " }\n", + "\n", + " vectors_to_upload.append(vector_entry)\n", + " \n", + " # Upload in batches to avoid memory issues\n", + " if len(vectors_to_upload) >= BATCH_SIZE:\n", + " print(f\"\\nUploading batch of {len(vectors_to_upload)} vectors...\")\n", + " response = s3vectors_client.put_vectors(\n", + " vectorBucketName=S3_VECTORS_BUCKET,\n", + " indexName=S3_VECTORS_INDEX,\n", + " vectors=vectors_to_upload\n", + " )\n", + " print(f\"Batch upload response: {response.get('ResponseMetadata', {}).get('HTTPStatusCode')}\")\n", + " vectors_to_upload = [] # Clear batch\n", + " \n", + " except Exception as e:\n", + " print(f\"\\nFailed to process sample {i}: {e}\")\n", + " failed_samples.append(i)\n", + " continue\n", + "\n", + "# Upload remaining vectors\n", + "if vectors_to_upload:\n", + " print(f\"\\nUploading final batch of {len(vectors_to_upload)} vectors...\")\n", + " response = s3vectors_client.put_vectors(\n", + " vectorBucketName=S3_VECTORS_BUCKET,\n", + " indexName=S3_VECTORS_INDEX,\n", + " vectors=vectors_to_upload\n", + " )\n", + " print(f\"Final batch upload response: {response.get('ResponseMetadata', {}).get('HTTPStatusCode')}\")\n", + "\n", + "print(f\"\\nImport completed!\")\n", + "print(f\"Successfully processed: {samples_to_process - len(failed_samples)} samples from CSV data\")\n", + "print(f\"Failed samples: {len(failed_samples)}\")\n", + "if failed_samples:\n", + " print(f\"Failed sample indices: {failed_samples[:10]}...\") # Show first 10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. 
Verify Import with Similarity Search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load test split for similarity search verification\n", + "test_dataset = load_dataset('csv', data_dir=str(dataset_dir), split='test')\n", + "print(f\"Loaded test dataset with {len(test_dataset)} samples\")\n", + "\n", + "if test_dataset is not None and len(test_dataset) > 0:\n", + " # Use the first sample from test split\n", + " test_sample_index = 0\n", + " test_csv_row = test_dataset[test_sample_index]\n", + " \n", + " # Get test image from CSV row\n", + " test_image_files_str = test_csv_row.get('image_files', '')\n", + " if test_image_files_str:\n", + " try:\n", + " test_image_files = json.loads(test_image_files_str)\n", + " if test_image_files:\n", + " test_image_path = dataset_root_dir / test_image_files[0]\n", + " if test_image_path.exists():\n", + " test_image_bytes = get_image_bytes_from_file(test_image_path)\n", + " print(f\"Loaded test image: {test_image_files[0]}\")\n", + " else:\n", + " print(f\"Test image file not found: {test_image_path}\")\n", + " test_image_bytes = None\n", + " else:\n", + " print(\"Empty image_files array in test sample\")\n", + " test_image_bytes = None\n", + " except (json.JSONDecodeError, IndexError) as e:\n", + " print(f\"Failed to parse test image_files: {e}\")\n", + " test_image_bytes = None\n", + " else:\n", + " print(\"No image_files found in test sample\")\n", + " test_image_bytes = None\n", + "else:\n", + " print(\"Test split is empty or could not be loaded\")\n", + " test_image_bytes = None\n", + "\n", + "if test_image_bytes is not None:\n", + " print(f\"\\nTesting similarity search with test sample {test_sample_index}...\")\n", + "\n", + " # Generate embedding for test image\n", + " test_embedding = bedrock_client.generate_embedding(\n", + " image_source=test_image_bytes,\n", + " model_id=EMBEDDING_MODEL_ID,\n", + " dimensions=EMBEDDING_DIMENSIONS\n", + " )\n", + "else:\n", + " print(\"No test image available for similarity search verification.\")\n", + " test_embedding = None\n", + "\n", + "if test_embedding is not None:\n", + " # Query S3 Vectors for similar examples\n", + " response = s3vectors_client.query_vectors(\n", + " vectorBucketName=S3_VECTORS_BUCKET,\n", + " indexName=S3_VECTORS_INDEX,\n", + " queryVector={\"float32\": test_embedding},\n", + " topK=5,\n", + " returnDistance=True,\n", + " returnMetadata=True\n", + " )\n", + "\n", + " print(f\"\\nFound {len(response['vectors'])} similar examples:\")\n", + " for i, vector in enumerate(response['vectors']):\n", + " distance = vector.get('distance', 'N/A')\n", + " key = vector.get('key', 'N/A')\n", + " metadata = vector.get('metadata', {})\n", + " class_label = metadata.get('classLabel', 'N/A')\n", + " class_prompt = metadata.get('classPrompt', 'N/A')\n", + " attributes_prompt = metadata.get('attributesPrompt', 'N/A')\n", + " image_path = metadata.get('imagePath', 'N/A')\n", + " \n", + " print(f\" {i+1}. 
Key: {key}\")\n", + " print(f\" Distance: {distance:.4f}\")\n", + " print(f\" Class Label: {class_label}\")\n", + " print(f\" Class Prompt: {class_prompt}\")\n", + " print(f\" Attributes Prompt: {attributes_prompt[:100]}...\") # Truncate for readability\n", + " print(f\" Image Path: {image_path}\")\n", + " print()\n", + "else:\n", + " print(\"Skipping similarity search - no test embedding available.\")\n", + "\n", + "# Display source image and found similar images\n", + "if test_image_bytes is not None and 'response' in locals() and response.get('vectors'):\n", + " import matplotlib.pyplot as plt\n", + " from PIL import Image as PILImage\n", + " import io\n", + " \n", + " # Calculate number of images to display (source + top similar images)\n", + " num_similar = min(3, len(response['vectors'])) # Show top 3 similar images\n", + " total_images = 1 + num_similar # Source + similar images\n", + " \n", + " # Create subplot layout\n", + " fig, axes = plt.subplots(1, total_images, figsize=(5 * total_images, 6))\n", + " if total_images == 1:\n", + " axes = [axes] # Make it iterable for single image\n", + " \n", + " # Display source image\n", + " source_img = PILImage.open(io.BytesIO(test_image_bytes))\n", + " axes[0].imshow(source_img)\n", + " axes[0].set_title(f'Source Image (Test Sample {test_sample_index})', fontsize=12, fontweight='bold')\n", + " axes[0].axis('off')\n", + " \n", + " # Display similar images\n", + " for i, vector in enumerate(response['vectors'][:num_similar]):\n", + " try:\n", + " # Get image path from metadata\n", + " metadata = vector.get('metadata', {})\n", + " image_s3_path = metadata.get('imagePath', '')\n", + " distance = vector.get('distance', 0)\n", + " \n", + " if image_s3_path:\n", + " # Extract S3 key from the full S3 URI\n", + " s3_key = image_s3_path.replace(f's3://{GENAIIDP_S3_WORKING_BUCKET}/', '')\n", + " \n", + " # Download image from S3\n", + " try:\n", + " response_obj = s3_client.get_object(Bucket=GENAIIDP_S3_WORKING_BUCKET, Key=s3_key)\n", + " image_data = response_obj['Body'].read()\n", + " similar_img = PILImage.open(io.BytesIO(image_data))\n", + " \n", + " # Display the image\n", + " axes[i + 1].imshow(similar_img)\n", + " axes[i + 1].set_title(f'Similar #{i+1}\\nDistance: {distance:.3f}', fontsize=10)\n", + " axes[i + 1].axis('off')\n", + " \n", + " except Exception as e:\n", + " # If can't load from S3, show placeholder\n", + " axes[i + 1].text(0.5, 0.5, f'Image not available\\n{str(e)[:50]}...', \n", + " ha='center', va='center', transform=axes[i + 1].transAxes)\n", + " axes[i + 1].set_title(f'Similar #{i+1}\\nDistance: {distance:.3f}', fontsize=10)\n", + " axes[i + 1].axis('off')\n", + " else:\n", + " # No image path available\n", + " axes[i + 1].text(0.5, 0.5, 'No image path', ha='center', va='center', \n", + " transform=axes[i + 1].transAxes)\n", + " axes[i + 1].set_title(f'Similar #{i+1}\\nDistance: {distance:.3f}', fontsize=10)\n", + " axes[i + 1].axis('off')\n", + " \n", + " except Exception as e:\n", + " print(f'Error displaying similar image {i+1}: {e}')\n", + " axes[i + 1].text(0.5, 0.5, f'Error: {str(e)[:30]}...', ha='center', va='center', \n", + " transform=axes[i + 1].transAxes)\n", + " axes[i + 1].set_title(f'Similar #{i+1}', fontsize=10)\n", + " axes[i + 1].axis('off')\n", + " \n", + " plt.tight_layout()\n", + " plt.show()\n", + " \n", + " print(f'\\nDisplayed source image and top {num_similar} similar images from the vector store.')\n", + " \n", + "else:\n", + " print('No images to display - either no test image was loaded or no 
similar images were found.')\n",
+    "    if test_image_bytes is None:\n",
+    "        print('Reason: No test image available')\n",
+    "    elif 'response' not in locals():\n",
+    "        print('Reason: No similarity search was performed')\n",
+    "    elif not response.get('vectors'):\n",
+    "        print('Reason: No similar images found in vector store')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 8. Summary and Next Steps"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"=== Few-shot Dataset Import Summary ===\")\n",
+    "print(f\"✅ Dataset: FCC Invoices (REALKIE)\")\n",
+    "print(f\"✅ Samples processed: {samples_to_process - len(failed_samples) if 'samples_to_process' in locals() and 'failed_samples' in locals() else 'N/A'}\")\n",
+    "print(f\"✅ S3 Vectors Bucket: {S3_VECTORS_BUCKET}\")\n",
+    "print(f\"✅ S3 Vectors Index: {S3_VECTORS_INDEX}\")\n",
+    "print(f\"✅ Images stored in: s3://{GENAIIDP_S3_WORKING_BUCKET}/fcc_invoices/\")\n",
+    "print(f\"✅ Embedding Model: {EMBEDDING_MODEL_ID}\")\n",
+    "print(f\"✅ Similarity search verified\")\n",
+    "\n",
+    "print(\"\\n=== Next Steps ===\")\n",
+    "print(\"1. ✅ Updated attributes mapping to match actual FCC invoices dataset structure\")\n",
+    "print(\"2. ✅ Added ground truth label parsing from CSV data\")\n",
+    "print(\"3. Configure your IDP extraction to use the dynamic few-shot Lambda ARN\")\n",
+    "print(\"4. Test document processing with few-shot examples!\")\n",
+    "print(\"5. Fine-tune the label parsing logic if needed based on your specific use case\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

From c8b3b2490e4213d4c58de4f424185504f4a7eb8a Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Fri, 12 Dec 2025 17:30:37 +0100
Subject: [PATCH 23/39] chore: use custom_prompt_lambda_arn parameter

---
 .../step3_extraction_with_dynamic_few_shot.ipynb | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb b/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb
index 5d5a0663..88879c64 100644
--- a/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb
+++ b/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb
@@ -14,13 +14,13 @@
     "- Handle errors and monitor performance\n",
     "\n",
     "**Prerequisites:**\n",
-    "- Completed Step 2 (Classification)\n",
+    "- Completed Step 2 (Classification) (`notebooks/fewshot_dataset_import.ipynb`)\n",
     "- AWS Lambda permissions to create/invoke functions\n",
     "- Dynamic few-shot Lambda function deployed\n",
-    "- S3 Vectors index populated with examples (`notebooks/misc/fewshot_dataset_import.ipynb`)\n",
+    "- S3 Vectors index populated with examples (`notebooks/fewshot_dataset_import.ipynb`)\n",
     "\n",
     "**Key Feature:**\n",
-    "The `dynamic_few_shot_lambda_arn` configuration field allows you to dynamically retrieve similar examples using S3 Vectors similarity search to improve extraction accuracy through few-shot prompting."
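+    "\n",
+    "A minimal sketch of wiring this up (the ARN below is a placeholder for illustration; section 3 of this notebook resolves the real one):\n",
+    "\n",
+    "```python\n",
+    "# Hypothetical ARN - replace with your deployed function's ARN\n",
+    "CONFIG['extraction']['custom_prompt_lambda_arn'] = (\n",
+    "    'arn:aws:lambda:us-east-1:123456789012:function:GENAIIDP-dynamic-few-shot'\n",
+    ")\n",
+    "```\n",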
+ "The `custom_prompt_lambda_arn` configuration field allows you to dynamically retrieve similar examples using S3 Vectors similarity search to improve extraction accuracy through few-shot prompting." ] }, { @@ -71,7 +71,7 @@ "outputs": [], "source": [ "# Load document from previous step\n", - "classification_data_dir = Path(\".data/step2_classification\")\n", + "classification_data_dir = Path(\"../../../notebooks/examples/.data/step2_classification\")\n", "\n", "# Load document object from JSON\n", "document_path = classification_data_dir / \"document.json\"\n", @@ -174,12 +174,12 @@ "source": [ "# Create configuration WITHOUT dynamic few-shot Lambda\n", "config_default = CONFIG.copy()\n", - "if 'dynamic_few_shot_lambda_arn' in config_default.get('extraction', {}):\n", - " del config_default['extraction']['dynamic_few_shot_lambda_arn']\n", + "if 'custom_prompt_lambda_arn' in config_default.get('extraction', {}):\n", + " del config_default['extraction']['custom_prompt_lambda_arn']\n", "\n", "print(\"=== DEFAULT EXTRACTION CONFIGURATION ===\")\n", "print(f\"Model: {config_default.get('extraction', {}).get('model')}\")\n", - "print(f\"Dynamic Few-Shot Lambda: {config_default.get('extraction', {}).get('dynamic_few_shot_lambda_arn', 'None')}\")\n", + "print(f\"Dynamic Few-Shot Lambda: {config_default.get('extraction', {}).get('custom_prompt_lambda_arn', 'None')}\")\n", "\n", "# Create extraction service with default config\n", "extraction_service_default = extraction.ExtractionService(config=config_default)\n", @@ -259,7 +259,7 @@ "if DYNAMIC_FEW_SHOT_LAMBDA_ARN:\n", " # Create configuration WITH dynamic few-shot Lambda\n", " config_few_shot = CONFIG.copy()\n", - " config_few_shot['extraction']['dynamic_few_shot_lambda_arn'] = DYNAMIC_FEW_SHOT_LAMBDA_ARN\n", + " config_few_shot['extraction']['custom_prompt_lambda_arn'] = DYNAMIC_FEW_SHOT_LAMBDA_ARN\n", " \n", " print(\"=== DYNAMIC FEW-SHOT EXTRACTION CONFIGURATION ===\")\n", " print(f\"Model: {config_few_shot.get('extraction', {}).get('model')}\")\n", From 41b2a579810c2354b72f9372d35bac678ddc6dcc Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 12 Dec 2025 17:30:59 +0100 Subject: [PATCH 24/39] chore: add classes configuration for step-by-step example --- .../notebooks/config/classes.yaml | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml diff --git a/plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml b/plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml new file mode 100644 index 00000000..e19746f9 --- /dev/null +++ b/plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml @@ -0,0 +1,119 @@ +classes: + - $schema: https://json-schema.org/draft/2020-12/schema + $defs: + LineItem: + type: object + properties: + LineItemEndDate: + default: 'null' + x-aws-idp-confidence-threshold: '0.8' + examples: + - 11/06/2012 + data_type: string + format: date + description: End date for each line item (typically in MM/DD/YY format) + type: string + x-aws-idp-evaluation-method: LEVENSHTEIN + x-aws-idp-evaluation-threshold: '0.7' + LineItemDescription: + data_type: string + description: Description of the line item + type: string + x-aws-idp-evaluation-method: LEVENSHTEIN + x-aws-idp-evaluation-threshold: '0.7' + LineItemStartDate: + default: 'null' + x-aws-idp-confidence-threshold: '0.8' + examples: + - 11/06/2012 + data_type: string + format: date + description: Start date for each line item (typically in MM/DD/YY format) + type: string + 
x-aws-idp-evaluation-method: LEVENSHTEIN + x-aws-idp-evaluation-threshold: '0.7' + LineItemDays: + maxItems: '7' + x-aws-idp-confidence-threshold: '0.8' + uniqueItems: true + description: List of days of the week for the line item + type: array + items: + type: string + data_type: string + enum: + - M + - T + - W + - Th + - F + - S + - Su + x-aws-idp-evaluation-method: EXACT + x-aws-idp-evaluation-threshold: '0.7' + LineItemRate: + data_type: string + description: Rate of the line item + x-aws-idp-confidence-threshold: '0.8' + type: number + x-aws-idp-evaluation-method: NUMERIC_EXACT + description: Invoice document + type: object + x-aws-idp-document-type: Invoice + properties: + LineItems: + type: array + description: List of line items in the invoice + items: + $ref: '#/$defs/LineItem' + Agency: + x-aws-idp-confidence-threshold: '0.8' + data_type: string + description: The advertising agency or station. May be labelled Agency, or Station. + x-aws-idp-evaluation-weight: '2' + type: string + x-aws-idp-evaluation-method: LEVENSHTEIN + x-aws-idp-evaluation-threshold: '0.7' + Advertiser: + x-aws-idp-confidence-threshold: '0.8' + data_type: string + description: The political advertiser or campaign purchasing the broadcast time + x-aws-idp-evaluation-weight: '2' + type: string + x-aws-idp-evaluation-method: FUZZY + x-aws-idp-evaluation-threshold: '0.8' + GrossTotal: + data_type: string + description: >- + The total gross amount for all line items before any discounts or + adjustments + x-aws-idp-evaluation-weight: '2' + x-aws-idp-confidence-threshold: '0.8' + type: number + x-aws-idp-evaluation-method: NUMERIC_EXACT + PaymentTerms: + examples: + - Net 30 + data_type: string + description: Payment terms + x-aws-idp-evaluation-weight: '0.2' + type: string + x-aws-idp-evaluation-method: FUZZY + x-aws-idp-evaluation-threshold: '0.7' + AgencyCommission: + data_type: string + description: Agency commission + x-aws-idp-evaluation-weight: '0.2' + x-aws-idp-confidence-threshold: '0.8' + type: number + x-aws-idp-evaluation-method: NUMERIC_EXACT + NetAmountDue: + data_type: string + description: >- + The final net amount due after any discounts or adjustments have been + applied (stored as string with commas) + x-aws-idp-evaluation-weight: '2' + x-aws-idp-confidence-threshold: '0.8' + type: number + x-aws-idp-evaluation-method: NUMERIC_EXACT + $id: Invoice From 0c2b1055af2717130f1ce782ff7dd425bae3d5a6 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Fri, 12 Dec 2025 17:40:48 +0100 Subject: [PATCH 25/39] chore: remove step-by-step extraction notebook --- .../notebooks/config/classes.yaml | 119 ----- .../config/extraction_with_few_shot.yaml | 101 ----- ...ep3_extraction_with_dynamic_few_shot.ipynb | 420 ------------------ 3 files changed, 640 deletions(-) delete mode 100644 plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml delete mode 100644 plugins/dynamic-few-shot-lambda/notebooks/config/extraction_with_few_shot.yaml delete mode 100644 plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb diff --git a/plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml b/plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml deleted file mode 100644 index e19746f9..00000000 --- a/plugins/dynamic-few-shot-lambda/notebooks/config/classes.yaml +++ /dev/null @@ -1,119 +0,0 @@ -classes: - - $schema: https://json-schema.org/draft/2020-12/schema - $defs: - LineItem: - type: object - properties: - LineItemEndDate: - default: 'null' - x-aws-idp-confidence-threshold: 
'0.8' - examples: - - 11/06/2012 - data_type: string - format: date - description: End date for each line item (typically in MM/DD/YY format) - type: string - x-aws-idp-evaluation-method: LEVENSHTEIN - x-aws-idp-evaluation-threshold: '0.7' - LineItemDescription: - data_type: string - description: Description of the line item - type: string - x-aws-idp-evaluation-method: LEVENSHTEIN - x-aws-idp-evaluation-threshold: '0.7' - LineItemStartDate: - default: 'null' - x-aws-idp-confidence-threshold: '0.8' - examples: - - 11/06/2012 - data_type: string - format: date - description: Start date for each line item (typically in MM/DD/YY format) - type: string - x-aws-idp-evaluation-method: LEVENSHTEIN - x-aws-idp-evaluation-threshold: '0.7' - LineItemDays: - maxItems: '7' - x-aws-idp-confidence-threshold: '0.8' - uniqueItems: true - description: List of days of the week for the line item - type: array - items: - type: string - data_type: string - enum: - - M - - T - - W - - Th - - F - - S - - Su - x-aws-idp-evaluation-method: EXACT - x-aws-idp-evaluation-threshold: '0.7' - LineItemRate: - data_type: string - description: Rate of the line item - x-aws-idp-confidence-threshold: '0.8' - type: number - x-aws-idp-evaluation-method: NUMERIC_EXACT - description: Invoice document - type: object - x-aws-idp-document-type: Invoice - properties: - LineItems: - type: array - description: List of line items in the invoice - items: - $ref: '#/$defs/LineItem' - Agency: - x-aws-idp-confidence-threshold: '0.8' - data_type: string - description: The advertising agency or station. May be labelled Agency, or Station. - x-aws-idp-evaluation-weight: '2' - type: string - x-aws-idp-evaluation-method: LEVENSHTEIN - x-aws-idp-evaluation-threshold: '0.7' - Advertiser: - x-aws-idp-confidence-threshold: '0.8' - data_type: string - description: The political advertiser or campaign purchasing the broadcast time - x-aws-idp-evaluation-weight: '2' - type: string - x-aws-idp-evaluation-method: FUZZY - x-aws-idp-evaluation-threshold: '0.8' - GrossTotal: - data_type: string - description: >- - The total gross amount for all line items before any discounts or - adjustments - x-aws-idp-evaluation-weight: '2' - x-aws-idp-confidence-threshold: '0.8' - type: number - x-aws-idp-evaluation-method: NUMERIC_EXACT - PaymentTerms: - examples: - - Net 30 - data_type: string - description: Payment terms - x-aws-idp-evaluation-weight: '0.2' - type: string - x-aws-idp-evaluation-method: FUZZY - x-aws-idp-evaluation-threshold: '0.7' - AgencyCommission: - data_type: string - description: Agency commission - x-aws-idp-evaluation-weight: '0.2' - x-aws-idp-confidence-threshold: '0.8' - type: number - x-aws-idp-evaluation-method: NUMERIC_EXACT - NetAmountDue: - data_type: string - description: >- - The final net amount due after any discounts or adjustments have been - applied (stored as string with commas) - x-aws-idp-evaluation-weight: '2' - x-aws-idp-confidence-threshold: '0.8' - type: number - x-aws-idp-evaluation-method: NUMERIC_EXACT - $id: Invoice diff --git a/plugins/dynamic-few-shot-lambda/notebooks/config/extraction_with_few_shot.yaml b/plugins/dynamic-few-shot-lambda/notebooks/config/extraction_with_few_shot.yaml deleted file mode 100644 index addd9a01..00000000 --- a/plugins/dynamic-few-shot-lambda/notebooks/config/extraction_with_few_shot.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# Extraction Service Configuration -extraction: - top_p: '0.1' - max_tokens: '4096' - top_k: '5' - temperature: '0.0' - model: us.amazon.nova-pro-v1:0 - system_prompt: >- 
- You are a document assistant. Respond only with JSON. Never make up data, only provide data found in the document being provided. - task_prompt: >- - - - You are an expert in document analysis and information extraction. - You can understand and extract key information from documents classified as type - - {DOCUMENT_CLASS}. - - - - - - - Your task is to take the unstructured text provided and convert it into a well-organized table format using JSON. Identify the main entities, attributes, or categories mentioned in the attributes list below and use them as keys in the JSON object. - Then, extract the relevant information from the text and populate the corresponding values in the JSON object. - - - - - - - Guidelines: - 1. Ensure that the data is accurately represented and properly formatted within - the JSON structure - 2. Include double quotes around all keys and values - 3. Do not make up data - only extract information explicitly found in the - document - 4. Do not use /n for new lines, use a space instead - 5. If a field is not found or if unsure, return null - 6. All dates should be in MM/DD/YYYY format - 7. Do not perform calculations or summations unless totals are explicitly given - 8. If an alias is not found in the document, return null - 9. Guidelines for checkboxes: - 9.A. CAREFULLY examine each checkbox, radio button, and selection field: - - Look for marks like โœ“, โœ—, x, filled circles (โ—), darkened areas, or handwritten checks indicating selection - - For checkboxes and multi-select fields, ONLY INCLUDE options that show clear visual evidence of selection - - DO NOT list options that have no visible selection mark - 9.B. For ambiguous or overlapping tick marks: - - If a mark overlaps between two or more checkboxes, determine which option contains the majority of the mark - - Consider a checkbox selected if the mark is primarily inside the check box or over the option text - - When a mark touches multiple options, analyze which option was most likely intended based on position and density. For handwritten checks, the mark typically flows from the selected checkbox outward. - - Carefully analyze visual cues and contextual hints. Think from a human perspective, anticipate natural tendencies, and apply thoughtful reasoning to make the best possible judgment. - 10. Think step by step first and then answer. - - - - If the attributes section below contains a list of attribute names and - descriptions, then output only those attributes, using the provided - descriptions as guidance for finding the correct values. - - - - {ATTRIBUTE_NAMES_AND_DESCRIPTIONS} - - - - - - {FEW_SHOT_EXAMPLES} - - - - <> - - - - - {DOCUMENT_TEXT} - - - - - - - {DOCUMENT_IMAGE} - - - - - - - Extract key information from the document and return a JSON object with the following key steps: - 1. Carefully analyze the document text to identify the requested attributes - 2. Extract only information explicitly found in the document - never make up data - 3. Format all dates as MM/DD/YYYY and replace newlines with spaces - 4. For checkboxes, only include options with clear visual selection marks - 5. Use null for any fields not found in the document - 6. Ensure the output is properly formatted JSON with quoted keys and values - 7. 
Think step by step before finalizing your answer - - - diff --git a/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb b/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb deleted file mode 100644 index 88879c64..00000000 --- a/plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_dynamic_few_shot.ipynb +++ /dev/null @@ -1,420 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Step 3: Dynamic Few-Shot Information Extraction\n", - "\n", - "This notebook demonstrates the **dynamic few-shot prompting feature** for Pattern 2. It shows how to:\n", - "\n", - "- Configure dynamic few-shot Lambda functions extraction\n", - "- Compare default vs examples-enhanced extraction results\n", - "- Inspect Lambda payloads and responses\n", - "- Handle errors and monitor performance\n", - "\n", - "**Prerequisites:**\n", - "- Completed Step 2 (Classification) (`notebooks/fewshot_dataset_import.ipynb`)\n", - "- AWS Lambda permissions to create/invoke functions\n", - "- Dynamic few-shot Lambda function deployed\n", - "- S3 Vectors index populated with examples (`notebooks/fewshot_dataset_import.ipynb`)\n", - "\n", - "**Key Feature:**\n", - "The `custom_prompt_lambda_arn` configuration field allows you to dynamically retrieve similar examples using S3 Vectors similarity search to improve extraction accuracy through few-shot prompting." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Setup and Import Libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import json\n", - "import time\n", - "import logging\n", - "import boto3\n", - "from pathlib import Path\n", - "import yaml\n", - "\n", - "# Import IDP libraries\n", - "from idp_common.models import Document, Status\n", - "from idp_common.s3 import get_json_content\n", - "from idp_common import extraction\n", - "\n", - "# Configure logging to see Lambda invocation details\n", - "logging.basicConfig(level=logging.INFO)\n", - "logging.getLogger('idp_common.extraction').setLevel(logging.INFO)\n", - "logging.getLogger('idp_common.bedrock.client').setLevel(logging.INFO)\n", - "\n", - "print(\"Libraries imported successfully\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. 
Load Previous Step Data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Load document from previous step\n",
-    "classification_data_dir = Path(\"../../../notebooks/examples/.data/step2_classification\")\n",
-    "\n",
-    "# Load document object from JSON\n",
-    "document_path = classification_data_dir / \"document.json\"\n",
-    "with open(document_path, 'r') as f:\n",
-    "    document = Document.from_json(f.read())\n",
-    "\n",
-    "# Load configuration directly from config files\n",
-    "config_dir = Path(\"config\")\n",
-    "CONFIG = {}\n",
-    "\n",
-    "# Load each configuration file\n",
-    "config_files = [\n",
-    "    \"extraction_with_few_shot.yaml\",\n",
-    "    \"classes.yaml\"\n",
-    "]\n",
-    "\n",
-    "for config_file in config_files:\n",
-    "    config_path = config_dir / config_file\n",
-    "    if config_path.exists():\n",
-    "        with open(config_path, 'r') as f:\n",
-    "            file_config = yaml.safe_load(f)\n",
-    "        CONFIG.update(file_config)\n",
-    "        print(f\"Loaded {config_file}\")\n",
-    "    else:\n",
-    "        print(f\"Warning: {config_file} not found\")\n",
-    "\n",
-    "# Load environment info\n",
-    "env_path = classification_data_dir / \"environment.json\"\n",
-    "with open(env_path, 'r') as f:\n",
-    "    env_info = json.load(f)\n",
-    "\n",
-    "# Set environment variables\n",
-    "os.environ['AWS_REGION'] = env_info['region']\n",
-    "os.environ['METRIC_NAMESPACE'] = 'IDP-Dynamic-Few-Shot'\n",
-    "\n",
-    "print(f\"Loaded document: {document.id}\")\n",
-    "print(f\"Document status: {document.status.value}\")\n",
-    "print(f\"Number of sections: {len(document.sections) if document.sections else 0}\")\n",
-    "print(f\"Loaded configuration sections: {list(CONFIG.keys())}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 3. Configure Dynamic Few-Shot Lambda ARN"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 🔧 CONFIGURATION: Set your dynamic few-shot Lambda ARN here\n",
-    "# Replace with your actual Lambda function ARN for live testing\n",
-    "\n",
-    "# Check if dynamic few-shot Lambda function exists\n",
-    "lambda_client = boto3.client('lambda')\n",
-    "DYNAMIC_FEW_SHOT_LAMBDA_ARN = None\n",
-    "\n",
-    "try:\n",
-    "    response = lambda_client.get_function(FunctionName='GENAIIDP-dynamic-few-shot')\n",
-    "    DYNAMIC_FEW_SHOT_LAMBDA_ARN = response['Configuration']['FunctionArn']\n",
-    "    print(f\"✅ Found dynamic few-shot Lambda function: {DYNAMIC_FEW_SHOT_LAMBDA_ARN}\")\n",
-    "except lambda_client.exceptions.ResourceNotFoundException:\n",
-    "    print(\"⚠️ Dynamic Few-Shot Lambda function not found: GENAIIDP-dynamic-few-shot\")\n",
-    "    print(\"💡 Deploy using: cd notebooks/examples/dynamic-few-shot-lambda && sam deploy --guided\")\n",
-    "except Exception as e:\n",
-    "    print(f\"Error checking Lambda function: {e}\")\n",
-    "\n",
-    "if not DYNAMIC_FEW_SHOT_LAMBDA_ARN:\n",
-    "    print(\"⚠️ No dynamic few-shot Lambda ARN configured\")\n",
-    "    print(\"💡 This demo will show standard extraction without few-shot examples\")\n",
-    "    print(\"🔧 To test with examples, deploy the dynamic few-shot Lambda first\")\n",
-    "else:\n",
-    "    print(f\"✅ Dynamic few-shot Lambda ARN configured: {DYNAMIC_FEW_SHOT_LAMBDA_ARN}\")\n",
-    "    print(\"🚀 This demo will use few-shot examples from S3 Vectors\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 4. 
Extraction Comparison: Default vs Dynamic Few-Shot"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 4.1 Default Extraction (Without Dynamic Few-Shot)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create configuration WITHOUT dynamic few-shot Lambda\n",
-    "config_default = CONFIG.copy()\n",
-    "if 'custom_prompt_lambda_arn' in config_default.get('extraction', {}):\n",
-    "    del config_default['extraction']['custom_prompt_lambda_arn']\n",
-    "\n",
-    "print(\"=== DEFAULT EXTRACTION CONFIGURATION ===\")\n",
-    "print(f\"Model: {config_default.get('extraction', {}).get('model')}\")\n",
-    "print(f\"Dynamic Few-Shot Lambda: {config_default.get('extraction', {}).get('custom_prompt_lambda_arn', 'None')}\")\n",
-    "\n",
-    "# Create extraction service with default config\n",
-    "extraction_service_default = extraction.ExtractionService(config=config_default)\n",
-    "print(\"\\n✅ Default extraction service initialized\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Run default extraction on first section\n",
-    "if document.sections:\n",
-    "    first_section = document.sections[0]\n",
-    "    print(f\"🔄 Processing section {first_section.section_id} with DEFAULT prompts\")\n",
-    "    print(f\"Classification: {first_section.classification}\")\n",
-    "    print(f\"Pages: {first_section.page_ids}\")\n",
-    "    \n",
-    "    # Save original document state\n",
-    "    document_default = Document.from_json(document.to_json())\n",
-    "    \n",
-    "    # Process with default extraction\n",
-    "    start_time = time.time()\n",
-    "    document_default = extraction_service_default.process_document_section(\n",
-    "        document=document_default,\n",
-    "        section_id=first_section.section_id\n",
-    "    )\n",
-    "    default_extraction_time = time.time() - start_time\n",
-    "    \n",
-    "    print(f\"✅ Default extraction completed in {default_extraction_time:.2f} seconds\")\n",
-    "\n",
-    "    # Store results for comparison\n",
-    "    default_section_result = None\n",
-    "    for section in document_default.sections:\n",
-    "        if section.section_id == first_section.section_id:\n",
-    "            default_section_result = section\n",
-    "            break\n",
-    "    \n",
-    "else:\n",
-    "    print(\"⚠️ No sections found in document\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Show section extraction result\n",
-    "if default_section_result:\n",
-    "    print(f\"\\nSection {default_section_result.section_id} extraction result:\")\n",
-    "    extraction_result_uri = default_section_result.extraction_result_uri\n",
-    "\n",
-    "    if extraction_result_uri:\n",
-    "        result = get_json_content(extraction_result_uri)\n",
-    "        result_json = json.dumps(result[\"inference_result\"], indent=2)\n",
-    "        print(result_json)\n",
-    "\n",
-    "else:\n",
-    "    print(\"⚠️ No sections found in document\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 4.2 Dynamic Few-Shot Extraction using Lambda"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if DYNAMIC_FEW_SHOT_LAMBDA_ARN:\n",
-    "    # Create configuration WITH dynamic few-shot Lambda\n",
-    "    config_few_shot = CONFIG.copy()\n",
-    "    config_few_shot['extraction']['custom_prompt_lambda_arn'] = DYNAMIC_FEW_SHOT_LAMBDA_ARN\n",
-    "    \n",
-    "    print(\"=== DYNAMIC FEW-SHOT EXTRACTION CONFIGURATION ===\")\n",
-    "    print(f\"Model: {config_few_shot.get('extraction', 
{}).get('model')}\")\n",
-    "    print(f\"Dynamic Few-Shot Lambda: {DYNAMIC_FEW_SHOT_LAMBDA_ARN}\")\n",
-    "    print(f\"Lambda Function Name: {DYNAMIC_FEW_SHOT_LAMBDA_ARN.split(':')[-1]}\")\n",
-    "    \n",
-    "    # Create extraction service with dynamic few-shot config\n",
-    "    extraction_service_few_shot = extraction.ExtractionService(config=config_few_shot)\n",
-    "    \n",
-    "    print(\"\\n✅ Dynamic few-shot extraction service initialized\")\n",
-    "    \n",
-    "else:\n",
-    "    print(\"⚠️ No dynamic few-shot Lambda ARN configured - skipping demonstration\")\n",
-    "    config_few_shot = None\n",
-    "    extraction_service_few_shot = None"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Run dynamic few-shot extraction on first section\n",
-    "if DYNAMIC_FEW_SHOT_LAMBDA_ARN and document.sections:\n",
-    "    first_section = document.sections[0]\n",
-    "    print(f\"🔄 Processing section {first_section.section_id} with DYNAMIC FEW-SHOT\")\n",
-    "    print(f\"Classification: {first_section.classification}\")\n",
-    "    print(f\"Pages: {first_section.page_ids}\")\n",
-    "    \n",
-    "    # Create fresh document copy for examples processing\n",
-    "    document_few_shot = Document.from_json(document.to_json())\n",
-    "    \n",
-    "    # Process with dynamic few-shot extraction\n",
-    "    start_time = time.time()\n",
-    "    \n",
-    "    try:\n",
-    "        document_few_shot = extraction_service_few_shot.process_document_section(\n",
-    "            document=document_few_shot,\n",
-    "            section_id=first_section.section_id\n",
-    "        )\n",
-    "        few_shot_extraction_time = time.time() - start_time\n",
-    "        \n",
-    "        print(f\"✅ Dynamic few-shot extraction completed in {few_shot_extraction_time:.2f} seconds\")\n",
-    "        \n",
-    "        # Store results for comparison\n",
-    "        few_shot_section_result = None\n",
-    "        for section in document_few_shot.sections:\n",
-    "            if section.section_id == first_section.section_id:\n",
-    "                few_shot_section_result = section\n",
-    "                break\n",
-    "        \n",
-    "        # Performance comparison\n",
-    "        overhead = few_shot_extraction_time - default_extraction_time\n",
-    "        print(f\"\\n📊 Performance Comparison:\")\n",
-    "        print(f\"   Default: {default_extraction_time:.2f}s\")\n",
-    "        print(f\"   Dynamic Few-Shot: {few_shot_extraction_time:.2f}s\")\n",
-    "        print(f\"   Dynamic Few-Shot Overhead: {overhead:.2f}s ({overhead/default_extraction_time*100:.1f}% increase)\")\n",
-    "        \n",
-    "    except Exception as e:\n",
-    "        print(f\"❌ Dynamic few-shot extraction failed: {e}\")\n",
-    "        print(\"\\n🔍 This demonstrates the fail-fast error handling behavior\")\n",
-    "        few_shot_section_result = None\n",
-    "        few_shot_extraction_time = None\n",
-    "    \n",
-    "else:\n",
-    "    print(\"⚠️ Skipping dynamic few-shot extraction (no Lambda configured or no sections)\")\n",
-    "    document_few_shot = None\n",
-    "    few_shot_section_result = None\n",
-    "    few_shot_extraction_time = None"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Show section extraction result\n",
-    "if few_shot_section_result:\n",
-    "    print(f\"\\nSection {few_shot_section_result.section_id} extraction result:\")\n",
-    "    extraction_result_uri = few_shot_section_result.extraction_result_uri\n",
-    "\n",
-    "    if extraction_result_uri:\n",
-    "        result = get_json_content(extraction_result_uri)\n",
-    "        result_json = json.dumps(result[\"inference_result\"], indent=2)\n",
-    "        print(result_json)\n",
-    "\n",
-    "else:\n",
-    "    print(\"⚠️ No sections found in document\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 5. Results and Summary"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"=== DEMO COMPLETE: SUMMARY ===\")\n",
-    "\n",
-    "sections_processed = 1 if document.sections else 0\n",
-    "dynamic_few_shot_used = DYNAMIC_FEW_SHOT_LAMBDA_ARN is not None\n",
-    "\n",
-    "print(f\"\\n✅ DEMO RESULTS:\")\n",
-    "print(f\"   📄 Document processed: {document.id}\")\n",
-    "print(f\"   📊 Sections processed: {sections_processed}\")\n",
-    "print(f\"   🔧 Dynamic Few-Shot used: {'Yes' if dynamic_few_shot_used else 'No'}\")\n",
-    "\n",
-    "if dynamic_few_shot_used and 'few_shot_extraction_time' in locals() and examples_extraction_time:\n",
-    "    print(f\"   ⏱️ Performance overhead: {few_shot_extraction_time - default_extraction_time:.2f}s\")\n",
-    "    print(f\"   📈 Accuracy improvement: Enhanced with few-shot examples\")\n",
-    "\n",
-    "print(f\"\\n🚀 TO IMPLEMENT DYNAMIC FEW-SHOT IN PRODUCTION:\")\n",
-    "print(f\"   1. 📝 Deploy dynamic few-shot Lambda stack\")\n",
-    "print(f\"   2. 📊 Populate S3 Vectors index with example documents\")\n",
-    "print(f\"   3. ⚙️ Add 'dynamic_few_shot_lambda_arn' to extraction config\")\n",
-    "print(f\"   4. 🧪 Test with your actual documents and use cases\")\n",
-    "print(f\"   5. 📊 Monitor CloudWatch logs for performance and accuracy\")\n",
-    "\n",
-    "print(f\"\\n📚 RESOURCES:\")\n",
-    "print(f\"   📖 Documentation: notebooks/examples/dynamic-few-shot-lambda/README.md\")\n",
-    "print(f\"   🔧 Lambda Function: notebooks/examples/dynamic-few-shot-lambda/GENAIIDP-dynamic-few-shot.py\")\n",
-    "print(f\"   ☁️ Deploy: cd notebooks/examples/dynamic-few-shot-lambda && sam deploy --guided\")\n",
-    "print(f\"   📊 Import Dataset: notebooks/misc/fewshot_dataset_import.ipynb\")\n",
-    "\n",
-    "print(f\"\\n📌 CONTINUE TO: step4_assessment.ipynb\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.11"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}

From 2d630ad4aa0b6bcb8de109795cbbdfbc8e508fde Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Tue, 16 Dec 2025 15:58:45 +0100
Subject: [PATCH 26/39] chore: fix step 3 extraction instructions

---
 plugins/dynamic-few-shot-lambda/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/dynamic-few-shot-lambda/README.md b/plugins/dynamic-few-shot-lambda/README.md
index d38e5384..ec58b3f6 100644
--- a/plugins/dynamic-few-shot-lambda/README.md
+++ b/plugins/dynamic-few-shot-lambda/README.md
@@ -115,7 +115,7 @@ extraction:
 ### Step 5: Run the Demo Notebook
 
 0. Run `notebooks/examples` steps 0, 1, 2
-1. Open `plugins/dynamic-few-shot-lambda/notebooks/step3_extraction_with_custom_lambda.ipynb`
+1. Open `notebooks/examples/step3_extraction_with_custom_lambda.ipynb`. In section 3, set `DEMO_LAMBDA_ARN` to `arn:aws:lambda:region:account:function:GENAIIDP-dynamic-few-shot`
 2. 
Run all cells to see the comparison
 
 ## Lambda Interface
 

From daf70290d97e5a039b881628a560f8ab373ca9aa Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Tue, 16 Dec 2025 17:30:15 +0100
Subject: [PATCH 27/39] chore: cfn_nag allow * resource on its permissions policy

---
 plugins/dynamic-few-shot-lambda/template.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml
index 71f47f81..b0dc8f6f 100644
--- a/plugins/dynamic-few-shot-lambda/template.yml
+++ b/plugins/dynamic-few-shot-lambda/template.yml
@@ -78,6 +78,8 @@ Resources:
             reason: "Demo function - does not require reserved concurrency as it scales based on demand"
           - id: W58
             reason: "Demo function - DLQ not required"
+          - id: W11
+            reason: "Demo function - allow * resource on its permissions policy"
       # checkov:skip=CKV_AWS_116: "DLQ not required for AppSync resolver function as GraphQL handles retries"
      # checkov:skip=CKV_AWS_117: "Function does not require VPC access as it only interacts with AWS services via APIs"
      # checkov:skip=CKV_AWS_115: "Function does not require reserved concurrency as it scales based on demand"

From f1ec3b9e355ea9bac2ef27dc25949447d6743594 Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Tue, 16 Dec 2025 17:33:24 +0100
Subject: [PATCH 28/39] chore: validation for LogLevel

---
 plugins/dynamic-few-shot-lambda/template.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml
index b0dc8f6f..50959bb6 100644
--- a/plugins/dynamic-few-shot-lambda/template.yml
+++ b/plugins/dynamic-few-shot-lambda/template.yml
@@ -46,9 +46,16 @@ Parameters:
     Type: String
     Default: "GENAIIDP-dynamic-few-shot"
 
+  # Logging configuration
   LogLevel:
     Type: String
     Default: INFO
+    AllowedValues:
+      - DEBUG
+      - INFO
+      - WARN
+      - ERROR
+    Description: Default logging level

From b88ace71aca52164aabe6ebdafd09e784835c184 Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Tue, 16 Dec 2025 17:34:49 +0100
Subject: [PATCH 29/39] chore: make LogRetentionDays a parameter

---
 plugins/dynamic-few-shot-lambda/template.yml | 27 +++++++++++++++++++++++++
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml
index 50959bb6..895f5d76 100644
--- a/plugins/dynamic-few-shot-lambda/template.yml
+++ b/plugins/dynamic-few-shot-lambda/template.yml
@@ -57,6 +57,31 @@ Parameters:
     - ERROR
     Description: Default logging level
 
+  LogRetentionDays:
+    Type: Number
+    Default: 30
+    Description: Number of days to retain CloudWatch logs
+    AllowedValues:
+      [
+        1,
+        3,
+        5,
+        7,
+        14,
+        30,
+        60,
+        90,
+        120,
+        150,
+        180,
+        365,
+        400,
+        545,
+        731,
+        1827,
+        3653,
+      ]
+
   GenAIIDPS3OutputBucketName:
     Type: String
     Description: "GenAIIDP S3OutputBucketName"
 
@@ -157,7 +182,7 @@ Resources:
     Properties:
       LogGroupName: !Sub "/aws/lambda/${LambdaFunctionName}"
-      RetentionInDays: 7 # Short retention for demo purposes
+      RetentionInDays: !Ref LogRetentionDays

From d278154db48b97273d604e9f400c4358a8d952b3 Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Tue, 16 Dec 2025 17:36:52 +0100
Subject: [PATCH 30/39] chore: use KMS key for log group

---
 plugins/dynamic-few-shot-lambda/template.yml | 7 +------
 1 file 
changed, 1 insertion(+), 6 deletions(-) diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index 895f5d76..77bc8b6c 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -174,15 +174,10 @@ Resources: DynamicFewShotLogGroup: Type: AWS::Logs::LogGroup - Metadata: - cfn_nag: - rules_to_suppress: - - id: W84 - reason: "Demo function - KMS CMK not required, but can be added by customer for production use cases" - # checkov:skip=CKV_AWS_158: "Demo function - KMS CMK not required, but can be added by customer for production use cases" Properties: LogGroupName: !Sub "/aws/lambda/${LambdaFunctionName}" RetentionInDays: !Ref LogRetentionDays + KmsKeyId: !GetAtt GenAIIDPCustomerManagedEncryptionKeyArn DynamicFewShotVectorBucket: Type: AWS::S3Vectors::VectorBucket From 0115242f7389dff9322d477acaaaa6ec4ed0519d Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Wed, 17 Dec 2025 11:58:13 +0100 Subject: [PATCH 31/39] chore: make bucket creation optional, add KMS key, add dataset bucket --- plugins/dynamic-few-shot-lambda/template.yml | 224 +++++++++++++++---- 1 file changed, 180 insertions(+), 44 deletions(-) diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index 77bc8b6c..ca57fa63 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -18,19 +18,27 @@ Parameters: VectorBucketName: Type: String - Default: "genaiidp-dynamic-few-shot" + Default: "" + Description: >- + (Optional) Existing S3 vectors bucket used. Provide the name of an existing S3 vectors + bucket here or leave blank to automatically create a new S3 vectors bucket. VectorIndexName: Type: String - Default: "documents" - - VectorDimensions: - Type: Number - Default: 3072 + Default: "" + Description: >- + (Optional) Existing S3 vectors index used. Provide the name of an existing S3 vectors + index here or leave blank to automatically create a new S3 vectors index. ModelId: Type: String Default: "amazon.nova-2-multimodal-embeddings-v1:0" + Description: Vector embedding model to use to create meaningful vector representations of documents + + VectorDimensions: + Type: Number + Default: 3072 + Description: Vector embedding length to use, as defined by the embedding model in use TopK: Type: Number @@ -44,7 +52,14 @@ Parameters: LambdaFunctionName: Type: String - Default: "GENAIIDP-dynamic-few-shot" + Default: "IDP-dynamic-few-shot" + + DatasetBucketName: + Type: String + Default: "" + Description: >- + (Optional) Existing bucket used for dynamic few-shot datasets. Provide the name of + an existing bucket here or leave blank to automatically create a new bucket. 
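+
+  # A sketch of supplying existing resources at deploy time
+  # (the bucket and index names below are placeholders, not defaults):
+  #   sam deploy --parameter-overrides \
+  #     VectorBucketName=my-existing-vector-bucket \
+  #     VectorIndexName=my-existing-index \
+  #     DatasetBucketName=my-existing-dataset-bucket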
# Logging configuration LogLevel: @@ -82,20 +97,16 @@ Parameters: 3653, ] - GenAIIDPS3OutputBucketName: - Type: String - Description: "GenAIIDP S3OutputBucketName" - - GenAIIDPS3WorkingBucketName: - Type: String - Description: "GenAIIDP WorkingBucket Name" - - GenAIIDPCustomerManagedEncryptionKeyArn: + # GenAI IDP parameters + IDPS3LoggingBucketName: Type: String - Description: "GenAIIDP CustomerManagedEncryptionKey ARN" + Description: "IDP LoggingBucket Name" Conditions: HasPermissionsBoundary: !Not [!Equals [!Ref PermissionsBoundaryArn, ""]] + ShouldCreateVectorBucket: !Equals [ !Ref VectorBucketName, "" ] + ShouldCreateVectorIndex: !Equals [ !Ref VectorIndexName, "" ] + ShouldCreateDatasetBucket: !Equals [ !Ref DatasetBucketName, "" ] Resources: @@ -126,12 +137,22 @@ Resources: - arm64 Timeout: 300 MemorySize: 512 - Description: Demo Lambda function for GenAI IDP dynamic few-shot prompting + Description: Demo Lambda function for GenAI IDP dynamic few-shot prompting using S3 Vectors Environment: Variables: LOG_LEVEL: !Ref LogLevel - S3VECTOR_BUCKET: !Ref VectorBucketName - S3VECTOR_INDEX: !Ref VectorIndexName + S3VECTOR_BUCKET: !If + - ShouldCreateVectorBucket + # Error: Requested attribute VectorBucketName must be a readonly property in schema for AWS::S3Vectors::VectorBucket + # - !GetAtt DynamicFewShotVectorBucket.VectorBucketName + - !Select [1, !Split ["/", !Ref DynamicFewShotVectorBucket]] + - !Ref VectorBucketName + S3VECTOR_INDEX: !If + - ShouldCreateVectorIndex + # Error: Requested attribute IndexName must be a readonly property in schema for AWS::S3Vectors::Index + # - !GetAtt DocumentsIndex.IndexName + - !Select [3, !Split ["/", !Ref DocumentsIndex]] + - !Ref VectorIndexName S3VECTOR_DIMENSIONS: !Ref VectorDimensions MODEL_ID: !Ref ModelId TOP_K: !Ref TopK @@ -142,9 +163,10 @@ Resources: Policies: - AWSLambdaBasicExecutionRole - S3ReadPolicy: - BucketName: !Ref GenAIIDPS3OutputBucketName - - S3ReadPolicy: - BucketName: !Ref GenAIIDPS3WorkingBucketName + BucketName: !If + - ShouldCreateDatasetBucket + - !Ref DatasetBucket + - !Ref DatasetBucketName - Statement: - Effect: Allow Action: cloudwatch:PutMetricData @@ -161,7 +183,13 @@ Resources: - s3vectors:GetVectors - s3vectors:QueryVectors Resource: - - !Ref DynamicFewShotVectorIndex + - !If + - ShouldCreateVectorIndex + - !Ref DocumentsIndex + - !If + - ShouldCreateVectorBucket + - !Sub "${DynamicFewShotVectorBucket}/index/${DocumentsIndex}" + - !Sub "arn:${AWS::Partition}:s3vectors:${AWS::Region}:${AWS::AccountId}:bucket/${VectorBucketName}/index/${DocumentsIndex}" - Effect: Allow Action: - kms:Encrypt @@ -170,32 +198,27 @@ Resources: - kms:GenerateDataKey* - kms:DescribeKey Resource: - - !Ref GenAIIDPCustomerManagedEncryptionKeyArn + - !GetAtt CustomerManagedEncryptionKey.Arn DynamicFewShotLogGroup: Type: AWS::Logs::LogGroup Properties: LogGroupName: !Sub "/aws/lambda/${LambdaFunctionName}" RetentionInDays: !Ref LogRetentionDays - KmsKeyId: !GetAtt GenAIIDPCustomerManagedEncryptionKeyArn + KmsKeyId: !GetAtt CustomerManagedEncryptionKey.Arn DynamicFewShotVectorBucket: Type: AWS::S3Vectors::VectorBucket - Metadata: - cfn_nag: - rules_to_suppress: - - id: W84 - reason: "Demo function - KMS CMK not required, but can be added by customer for production use cases" - # checkov:skip=CKV_AWS_158: "Demo function - KMS CMK not required, but can be added by customer for production use cases" + Condition: ShouldCreateVectorBucket Properties: - VectorBucketName: !Ref VectorBucketName EncryptionConfiguration: - SseType: "AES256" + SseType: 
"aws:kms" + KmsKeyArn: !GetAtt CustomerManagedEncryptionKey.Arn - DynamicFewShotVectorIndex: + DocumentsIndex: Type: AWS::S3Vectors::Index + Condition: ShouldCreateVectorIndex Properties: - IndexName: !Ref VectorIndexName DataType: "float32" Dimension: !Ref VectorDimensions DistanceMetric: "cosine" @@ -204,7 +227,111 @@ Resources: - "classPrompt" - "attributesPrompt" - "imagePath" - VectorBucketArn: !Ref DynamicFewShotVectorBucket + VectorBucketName: !If + - ShouldCreateVectorBucket + - !Ref AWS::NoValue + - VectorBucketName + VectorBucketArn: !If + - ShouldCreateVectorBucket + - !Ref DynamicFewShotVectorBucket + - !Ref AWS::NoValue + + DatasetBucket: + Type: AWS::S3::Bucket + Condition: ShouldCreateDatasetBucket + DeletionPolicy: RetainExceptOnCreate + Properties: + BucketEncryption: + ServerSideEncryptionConfiguration: + - ServerSideEncryptionByDefault: + SSEAlgorithm: aws:kms + KMSMasterKeyID: !Ref CustomerManagedEncryptionKey + PublicAccessBlockConfiguration: + BlockPublicAcls: true + BlockPublicPolicy: true + IgnorePublicAcls: true + RestrictPublicBuckets: true + VersioningConfiguration: + Status: Enabled + LoggingConfiguration: + DestinationBucketName: !Ref IDPS3LoggingBucketName + LogFilePrefix: fewshot-dataset-bucket-logs/ + + DatasetBucketPolicy: + Type: AWS::S3::BucketPolicy + Condition: ShouldCreateDatasetBucket + Properties: + Bucket: !Ref DatasetBucket + PolicyDocument: + Version: "2012-10-17" + Statement: + - Sid: EnforceSSLOnly + Effect: Deny + Principal: "*" + Action: "s3:*" + Resource: + - !Sub "${DatasetBucket.Arn}/*" + - !Sub "${DatasetBucket.Arn}" + Condition: + Bool: + "aws:SecureTransport": false + + CustomerManagedEncryptionKey: + Type: AWS::KMS::Key + Metadata: + security-matrix: + rules_to_suppress: + - id: IAM-005 + reason: "No cross-account access - only same account root and AWS services" + - id: KMS-007 + reason: "KMS monitoring not required for this IDP solution - comprehensive CloudWatch monitoring already in place" + - id: KMS-002 + reason: "kms:* permission for account root is standard pattern for administrative access to KMS keys" + Properties: + Description: KMS key for encryption of dynamic few-shot resources + EnableKeyRotation: true + KeyPolicy: + Version: "2012-10-17" + Statement: + - Sid: Enable IAM User Permissions + Effect: Allow + Principal: + AWS: !Sub "arn:${AWS::Partition}:iam::${AWS::AccountId}:root" + Action: kms:* + Resource: "*" + - Sid: Allow lambda to access the Keys + Effect: Allow + Principal: + AWS: !Sub "arn:${AWS::Partition}:iam::${AWS::AccountId}:root" + Action: + - kms:Encrypt + - kms:Decrypt + - kms:ReEncrypt* + - kms:GenerateDataKey* + - kms:DescribeKey + Resource: "*" + - Sid: Allow CloudWatch Logs to use the key + Effect: Allow + Principal: + Service: !Sub "logs.${AWS::URLSuffix}" + Action: + - kms:Encrypt + - kms:Decrypt + - kms:ReEncrypt* + - kms:GenerateDataKey* + - kms:DescribeKey + Resource: "*" + - Sid: Allow S3 Vectors indexing service to use the key + Effect: Allow + Principal: + Service: !Sub "indexing.s3vectors.${AWS::URLSuffix}" + Action: + - kms:Encrypt + - kms:Decrypt + - kms:ReEncrypt* + - kms:GenerateDataKey* + - kms:DescribeKey + Resource: "*" Outputs: @@ -220,17 +347,26 @@ Outputs: Description: CloudWatch Log Group for monitoring demo Lambda execution Value: !Ref DynamicFewShotLogGroup - DynamicFewShotVectorBucketArn: + VectorBucketName: Description: S3 Vectors bucket for dynamic few-shot examples - Value: !Ref DynamicFewShotVectorBucket + Value: !If + - ShouldCreateVectorBucket + - !Select [1, !Split ["/", 
!Ref DynamicFewShotVectorBucket]] + - !Ref VectorBucketName - DynamicFewShotVectorIndexArn: + VectorIndexName: Description: S3 Vectors index for dynamic few-shot examples - Value: !Ref DynamicFewShotVectorIndex + Value: !If + - ShouldCreateVectorIndex + - !Select [3, !Split ["/", !Ref DocumentsIndex]] + - !Ref VectorIndexName - DynamicFewShotDatasetBucket: - Description: S3 Bucket for example data sets - Value: !Ref DynamicFewShotDatasetBucket + DatasetBucket: + Description: S3 bucket for example data sets + Value: !If + - ShouldCreateDatasetBucket + - !Ref DatasetBucket + - !Ref DatasetBucketName UsageInstructions: Description: How to use this Lambda in your IDP configuration From 0835cdc0953d0e3475ce2a7cdf8a859e2bfa9b3a Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Wed, 17 Dec 2025 13:18:55 +0100 Subject: [PATCH 32/39] chore: allow access to IDP output bucket --- ...ic-few-shot.py => IDP-dynamic-few-shot.py} | 0 plugins/dynamic-few-shot-lambda/template.yml | 38 +++++++++++-------- 2 files changed, 22 insertions(+), 16 deletions(-) rename plugins/dynamic-few-shot-lambda/src/{GENAIIDP-dynamic-few-shot.py => IDP-dynamic-few-shot.py} (100%) diff --git a/plugins/dynamic-few-shot-lambda/src/GENAIIDP-dynamic-few-shot.py b/plugins/dynamic-few-shot-lambda/src/IDP-dynamic-few-shot.py similarity index 100% rename from plugins/dynamic-few-shot-lambda/src/GENAIIDP-dynamic-few-shot.py rename to plugins/dynamic-few-shot-lambda/src/IDP-dynamic-few-shot.py diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index ca57fa63..7bbe1206 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -100,7 +100,18 @@ Parameters: # GenAI IDP parameters IDPS3LoggingBucketName: Type: String - Description: "IDP LoggingBucket Name" + Description: + IDP LoggingBucket Name, to store access logs for the dataset bucket + + IDPS3OutputBucketName: + Type: String + Description: >- + IDP S3OutputBucketName, to read the documents being processed + + IDPCustomerManagedEncryptionKeyArn: + Type: String + Description: >- + IDP CustomerManagedEncryptionKey ARN, to decrypt documents being read from the output bucket Conditions: HasPermissionsBoundary: !Not [!Equals [!Ref PermissionsBoundaryArn, ""]] @@ -131,7 +142,7 @@ Resources: FunctionName: !Ref LambdaFunctionName PermissionsBoundary: !If [HasPermissionsBoundary, !Ref PermissionsBoundaryArn, !Ref AWS::NoValue] CodeUri: ./src - Handler: GENAIIDP-dynamic-few-shot.lambda_handler + Handler: IDP-dynamic-few-shot.lambda_handler Runtime: python3.12 Architectures: - arm64 @@ -144,8 +155,8 @@ Resources: S3VECTOR_BUCKET: !If - ShouldCreateVectorBucket # Error: Requested attribute VectorBucketName must be a readonly property in schema for AWS::S3Vectors::VectorBucket - # - !GetAtt DynamicFewShotVectorBucket.VectorBucketName - - !Select [1, !Split ["/", !Ref DynamicFewShotVectorBucket]] + # - !GetAtt VectorBucket.VectorBucketName + - !Select [1, !Split ["/", !Ref VectorBucket]] - !Ref VectorBucketName S3VECTOR_INDEX: !If - ShouldCreateVectorIndex @@ -167,6 +178,8 @@ Resources: - ShouldCreateDatasetBucket - !Ref DatasetBucket - !Ref DatasetBucketName + - S3ReadPolicy: + BucketName: !Ref IDPS3OutputBucketName - Statement: - Effect: Allow Action: cloudwatch:PutMetricData @@ -188,17 +201,14 @@ Resources: - !Ref DocumentsIndex - !If - ShouldCreateVectorBucket - - !Sub "${DynamicFewShotVectorBucket}/index/${DocumentsIndex}" + - !Sub "${VectorBucket}/index/${DocumentsIndex}" - !Sub 
"arn:${AWS::Partition}:s3vectors:${AWS::Region}:${AWS::AccountId}:bucket/${VectorBucketName}/index/${DocumentsIndex}" - Effect: Allow Action: - - kms:Encrypt - kms:Decrypt - - kms:ReEncrypt* - - kms:GenerateDataKey* - - kms:DescribeKey Resource: - !GetAtt CustomerManagedEncryptionKey.Arn + - !Ref IDPCustomerManagedEncryptionKeyArn DynamicFewShotLogGroup: Type: AWS::Logs::LogGroup @@ -207,7 +217,7 @@ Resources: RetentionInDays: !Ref LogRetentionDays KmsKeyId: !GetAtt CustomerManagedEncryptionKey.Arn - DynamicFewShotVectorBucket: + VectorBucket: Type: AWS::S3Vectors::VectorBucket Condition: ShouldCreateVectorBucket Properties: @@ -229,12 +239,8 @@ Resources: - "imagePath" VectorBucketName: !If - ShouldCreateVectorBucket - - !Ref AWS::NoValue + - !Select [1, !Split ["/", !Ref VectorBucket]] - VectorBucketName - VectorBucketArn: !If - - ShouldCreateVectorBucket - - !Ref DynamicFewShotVectorBucket - - !Ref AWS::NoValue DatasetBucket: Type: AWS::S3::Bucket @@ -351,7 +357,7 @@ Outputs: Description: S3 Vectors bucket for dynamic few-shot examples Value: !If - ShouldCreateVectorBucket - - !Select [1, !Split ["/", !Ref DynamicFewShotVectorBucket]] + - !Select [1, !Split ["/", !Ref VectorBucket]] - !Ref VectorBucketName VectorIndexName: From b05827cd054614dda0376fac181ec7a112fb0424 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Wed, 17 Dec 2025 13:19:55 +0100 Subject: [PATCH 33/39] chore: fix samconfig.toml --- plugins/dynamic-few-shot-lambda/samconfig.toml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/plugins/dynamic-few-shot-lambda/samconfig.toml b/plugins/dynamic-few-shot-lambda/samconfig.toml index e25430e5..ce714fd8 100644 --- a/plugins/dynamic-few-shot-lambda/samconfig.toml +++ b/plugins/dynamic-few-shot-lambda/samconfig.toml @@ -1,11 +1,10 @@ version = 0.1 [default.deploy.parameters] -stack_name = "GENAIIDP-dynamic-few-shot-stack" +stack_name = "IDP-dynamic-few-shot" resolve_s3 = true -s3_prefix = "GENAIIDP-dynamic-few-shot-stack" +s3_prefix = "IDP-dynamic-few-shot" region = "us-east-1" capabilities = "CAPABILITY_IAM" disable_rollback = true -parameter_overrides = "PermissionsBoundaryArn=\"\" VectorBucketName=\"genaiidp-dynamic-few-shot\" VectorIndexName=\"documents\" VectorDimensions=\"3072\" ModelId=\"amazon.nova-2-multimodal-embeddings-v1:0\" TopK=\"2\" LambdaFunctionName=\"GENAIIDP-dynamic-few-shot\"" image_repositories = [] From c2e5a14f36dff714405520227241bdad632bfd84 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Wed, 17 Dec 2025 13:29:47 +0100 Subject: [PATCH 34/39] chore: add reasoning for cfn_nag --- plugins/dynamic-few-shot-lambda/template.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index 7bbe1206..3dee77ad 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -127,14 +127,14 @@ Resources: cfn_nag: rules_to_suppress: - id: W89 - reason: "Demo function - does not require VPC access" + reason: "Function does not require VPC access as it only interacts with AWS services via APIs" - id: W92 - reason: "Demo function - does not require reserved concurrency as it scales based on demand" + reason: "Function does not require reserved concurrency as it scales based on demand" - id: W58 - reason: "Demo function - DLQ not required" + reason: "Function does not require DLQ as processing and retries are handled by the IDP framework" - id: W11 - reason: "Demo function - allow * 
resource on its permissions policy" - # checkov:skip=CKV_AWS_116: "DLQ not required for AppSync resolver function as GraphQL handles retries" + reason: "Allow * resource on its permissions policy for CloudWatch metrics" + # checkov:skip=CKV_AWS_116: "Function does not require DLQ" # checkov:skip=CKV_AWS_117: "Function does not require VPC access as it only interacts with AWS services via APIs" # checkov:skip=CKV_AWS_115: "Function does not require reserved concurrency as it scales based on demand" # checkov:skip=CKV_AWS_173: "Environment variables do not contain sensitive data - only configuration values like feature flags and non-sensitive settings" From c477c444f393b7149ce02ed9da709381621d8e9a Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Wed, 17 Dec 2025 13:31:00 +0100 Subject: [PATCH 35/39] chore: add more reasoning --- plugins/dynamic-few-shot-lambda/template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml index 3dee77ad..fe1a1635 100644 --- a/plugins/dynamic-few-shot-lambda/template.yml +++ b/plugins/dynamic-few-shot-lambda/template.yml @@ -134,7 +134,7 @@ Resources: reason: "Function does not require DLQ as processing and retries are handled by the IDP framework" - id: W11 reason: "Allow * resource on its permissions policy for CloudWatch metrics" - # checkov:skip=CKV_AWS_116: "Function does not require DLQ" + # checkov:skip=CKV_AWS_116: "Function does not require DLQ as processing and retries are handled by the IDP framework" # checkov:skip=CKV_AWS_117: "Function does not require VPC access as it only interacts with AWS services via APIs" # checkov:skip=CKV_AWS_115: "Function does not require reserved concurrency as it scales based on demand" # checkov:skip=CKV_AWS_173: "Environment variables do not contain sensitive data - only configuration values like feature flags and non-sensitive settings" From 1f8eb82bd5f189e3a445bffa3d7f06cc36dfd29d Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Wed, 17 Dec 2025 13:37:55 +0100 Subject: [PATCH 36/39] chore: decode base64 images --- lib/idp_common_pkg/idp_common/extraction/service.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/idp_common_pkg/idp_common/extraction/service.py b/lib/idp_common_pkg/idp_common/extraction/service.py index ead4e0a1..2bf21e77 100644 --- a/lib/idp_common_pkg/idp_common/extraction/service.py +++ b/lib/idp_common_pkg/idp_common/extraction/service.py @@ -10,6 +10,7 @@ from __future__ import annotations +import base64 import json import logging import os @@ -461,6 +462,13 @@ def _convert_image_uris_to_bytes_in_content( f"Invalid file path {image_uri} - expecting S3 path" ) + converted_item = image.prepare_bedrock_image_attachment(image_bytes) + elif "image_base64" in item: + image_base64 = item["image_base64"] + + # Decode image content + image_bytes = base64.b64decode(image_base64) + converted_item = image.prepare_bedrock_image_attachment(image_bytes) elif "image" in item: # Keep existing image objects as-is From 7dd9f057ae8679684a4f859d0a62d5b94f833528 Mon Sep 17 00:00:00 2001 From: Daniel Lorch Date: Wed, 17 Dec 2025 13:42:56 +0100 Subject: [PATCH 37/39] chore: return base64 encoded images instead of image_uri --- .../src/IDP-dynamic-few-shot.py | 11 ++++++++++- plugins/dynamic-few-shot-lambda/template.yml | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/plugins/dynamic-few-shot-lambda/src/IDP-dynamic-few-shot.py 
b/plugins/dynamic-few-shot-lambda/src/IDP-dynamic-few-shot.py
index 49aab6c3..1cf760e5 100644
--- a/plugins/dynamic-few-shot-lambda/src/IDP-dynamic-few-shot.py
+++ b/plugins/dynamic-few-shot-lambda/src/IDP-dynamic-few-shot.py
@@ -218,7 +218,16 @@ def _build_text_and_image_content(
     # Add images
     if image_content:
         for image_uri in image_content:
-            content.append({"image_uri": image_uri})
+            # Load image content
+            if image_uri.startswith("s3://"):
+                # Direct S3 URI
+                image_bytes = s3.get_binary_content(image_uri)
+            else:
+                raise ValueError(f"Invalid file path {image_uri} - expecting S3 path")
+
+            # Convert bytes to base64 string
+            image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+            content.append({"image_base64": image_base64})

     # Add text after image
     after_text = _prepare_prompt_from_template(
diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml
index fe1a1635..b7c2548b 100644
--- a/plugins/dynamic-few-shot-lambda/template.yml
+++ b/plugins/dynamic-few-shot-lambda/template.yml
@@ -148,7 +148,7 @@ Resources:
         - arm64
       Timeout: 300
       MemorySize: 512
-      Description: Demo Lambda function for GenAI IDP dynamic few-shot prompting using S3 Vectors
+      Description: Lambda function for GenAI IDP dynamic few-shot prompting using S3 Vectors
       Environment:
         Variables:
           LOG_LEVEL: !Ref LogLevel

From 800ed17b140fd7de223a18806afec4465095b339 Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Wed, 17 Dec 2025 13:48:57 +0100
Subject: [PATCH 38/39] chore: fix parameter

---
 plugins/dynamic-few-shot-lambda/template.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml
index b7c2548b..44405564 100644
--- a/plugins/dynamic-few-shot-lambda/template.yml
+++ b/plugins/dynamic-few-shot-lambda/template.yml
@@ -379,7 +379,7 @@ Outputs:
     Value: !Sub |
       Add this ARN to your extraction config:
         extraction:
-          dynamic_few_shot_lambda_arn: "${DynamicFewShotFunction.Arn}"
+          custom_prompt_lambda_arn: "${DynamicFewShotFunction.Arn}"

   MonitoringLink:
     Description: Direct link to CloudWatch logs for this function

From 2eb8573cb493a42074556f628cd5e0437b85eba5 Mon Sep 17 00:00:00 2001
From: Daniel Lorch
Date: Wed, 17 Dec 2025 14:08:35 +0100
Subject: [PATCH 39/39] chore: fix permission policy for s3 vectors

---
 plugins/dynamic-few-shot-lambda/template.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/plugins/dynamic-few-shot-lambda/template.yml b/plugins/dynamic-few-shot-lambda/template.yml
index 44405564..25184437 100644
--- a/plugins/dynamic-few-shot-lambda/template.yml
+++ b/plugins/dynamic-few-shot-lambda/template.yml
@@ -201,8 +201,8 @@ Resources:
               - !Ref DocumentsIndex
               - !If
                 - ShouldCreateVectorBucket
-                - !Sub "${VectorBucket}/index/${DocumentsIndex}"
-                - !Sub "arn:${AWS::Partition}:s3vectors:${AWS::Region}:${AWS::AccountId}:bucket/${VectorBucketName}/index/${DocumentsIndex}"
+                - !Sub "${VectorBucket}/index/${VectorIndexName}"
+                - !Sub "arn:${AWS::Partition}:s3vectors:${AWS::Region}:${AWS::AccountId}:bucket/${VectorBucketName}/index/${VectorIndexName}"
             - Effect: Allow
               Action:
                 - kms:Decrypt
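
Note on the image hand-off changed by patches 36 and 37: the Lambda now returns inline base64 image content ("image_base64" items) instead of "image_uri" S3 references, and the extraction service decodes those items back to raw bytes before preparing the Bedrock image attachment. A minimal sketch of that round trip follows; the helper names are illustrative only and are not part of the plugin or idp_common code:

import base64

def to_content_item(image_bytes: bytes) -> dict:
    # Encode as the Lambda does after patch 37: base64 keeps the
    # returned content items JSON-serializable.
    return {"image_base64": base64.b64encode(image_bytes).decode("utf-8")}

def from_content_item(item: dict) -> bytes:
    # Decode as the extraction service does after patch 36, recovering
    # the raw bytes used to build the Bedrock image attachment.
    return base64.b64decode(item["image_base64"])

sample = b"\x89PNG\r\n\x1a\n"  # illustrative image bytes (PNG signature)
assert from_content_item(to_content_item(sample)) == sample

One trade-off worth noting: inline base64 grows the Lambda response by roughly a third over the raw image size, which counts against the 6 MB synchronous invocation response limit, whereas the earlier S3 URIs kept the payload small at the cost of a second S3 read in the caller.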