39 commits
879f335
chore: fix missing substitution for custom_prompt_lambda_arn
Nov 28, 2025
335f87b
feat: dynamic-few shot Lambda using S3 Vectors
Nov 28, 2025
75eb394
chore: remove whitespace
Dec 11, 2025
fbe11b2
feat: add support for Amazon Titan Multimodal Embeddings G1 and Amazo…
Nov 28, 2025
94d33e7
chore: move idp_common.image import to generate_embedding function, o…
Nov 28, 2025
182ec1b
feat: add notebook to ingest FATURA2 dataset into S3 vectors
Nov 28, 2025
1e7cac3
chore: update input parameter for document_text + fixes
Dec 1, 2025
99a3605
feat: add notebook for dynamic few-shot Lambda testing
Dec 1, 2025
ae2a925
chore: placeholder bucket name
Dec 2, 2025
bd52a22
chore: clarify distance
Dec 2, 2025
289386b
chore: debug log for S3 vectors result
Dec 2, 2025
6fd1b5e
chore: filter S3 vectors result by threshold
Dec 2, 2025
8d16da1
chore: add comment on PIL requirement for generate_embedding
Dec 4, 2025
0b7a57d
chore: move dynamic-few-shot to plugins folder
Dec 11, 2025
035b28c
chore: ignore datasets folder
Dec 11, 2025
854fa8b
chore: ruff format
Dec 11, 2025
b5f8873
feat: update dynamic-few-shot Lambda to implement Custom Prompt Lambd…
Dec 11, 2025
4cb63fc
chore: configurable LOG_LEVEL
Dec 11, 2025
21c9855
feat: convert image_uri to image bytes from custom lambda invocation
Dec 12, 2025
f99467c
chore: use working bucket from GenAIIDP for dataset + adapt threshold
Dec 12, 2025
72c85f7
chore: remove FATURA2 dataset import
Dec 12, 2025
3c50242
feat: add fcc_invoices (REALKIE) dataset import
Dec 12, 2025
c8b3b24
chore: use custom_prompt_lambda_arn parameter
Dec 12, 2025
41b2a57
chore: add classes configuration for step-by-step example
Dec 12, 2025
0c2b105
chore: remove step-by-step extraction notebook
Dec 12, 2025
2d630ad
chore: fix step 3 extraction instructions
Dec 16, 2025
daf7029
chore: cfn_nag allow * resource on its permissions policy
Dec 16, 2025
f1ec3b9
chore: validation for LogLevel
Dec 16, 2025
b88ace7
chore: make LogRetentionDays as parameter
Dec 16, 2025
d278154
chore: use KMS key for log group
Dec 16, 2025
0115242
chore: make bucket creation optional, add KMS key, add dataset bucket
Dec 17, 2025
0835cdc
chore: allow access to IDP output bucket
Dec 17, 2025
b05827c
chore: fix samconfig.toml
Dec 17, 2025
c2e5a14
chore: add reasoning for cfn_nag
Dec 17, 2025
c477c44
chore: add more reasoning
Dec 17, 2025
1f8eb82
chore: decode base64 images
Dec 17, 2025
7dd9f05
chore: return base64 encoded images instead of image_uri
Dec 17, 2025
800ed17
chore: fix parameter
Dec 17, 2025
2eb8573
chore: fix permission policy for s3 vectors
Dec 17, 2025
41 changes: 41 additions & 0 deletions lib/idp_common_pkg/idp_common/bedrock/README.md
@@ -73,6 +73,47 @@ embedding = client.generate_embedding(
# Use embedding for vector search, clustering, etc.
```

Amazon Titan Multimodal Embeddings supports text and image input at the same time. The resulting embedding vector is the average of the text embedding and image embedding vectors.

```python
from idp_common.bedrock.client import BedrockClient

with open("/path/to/document.png", "rb") as image_file:
image_data = image_file.read()

client = BedrockClient()
embedding = client.generate_embedding(
text="This document contains information about loan applications.",
image_source=image_data,
model_id="amazon.titan-embed-image-v1"
)
```

The image source can also be an S3 URI:

```python
from idp_common.bedrock.client import BedrockClient

client = BedrockClient()
embedding = client.generate_embedding(
image_source="s3://bucket/key",
model_id="amazon.titan-embed-image-v1"
)
```

Amazon Nova Multimodal Embeddings with a 3072-dimension output vector:

```python
from idp_common.bedrock.client import BedrockClient

client = BedrockClient()
embedding = client.generate_embedding(
image_source="s3://bucket/key",
model_id="amazon.nova-2-multimodal-embeddings-v1:0",
dimensions=3072
)
```

## Prompt Caching with CachePoint

Prompt caching is a powerful feature in Amazon Bedrock that significantly reduces response latency for workloads with repetitive contexts. The Bedrock client provides built-in support for this via the `<<CACHEPOINT>>` tag.
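
For example, a prompt can place the stable, reusable context before the tag and the per-request portion after it, so the cached prefix is shared across invocations. A minimal sketch (the prompt text is illustrative; only the `<<CACHEPOINT>>` tag itself comes from the client's documented behavior):

```python
# Minimal sketch: the text before <<CACHEPOINT>> is the stable context that
# benefits from caching; the text after it changes with every request.
task_prompt = (
    "You are an expert document analyst. Use the class and attribute definitions below.\n"
    "<long, unchanging class definitions and few-shot examples>\n"
    "<<CACHEPOINT>>\n"
    "Extract the fields from the following document:\n"
    "<document text for this request>"
)
```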
80 changes: 73 additions & 7 deletions lib/idp_common_pkg/idp_common/bedrock/client.py
@@ -16,6 +16,7 @@
import copy
import random
import socket
import base64
from typing import Dict, Any, List, Optional, Union, Tuple, Type
from botocore.config import Config
from botocore.exceptions import (
@@ -26,7 +26,6 @@
)
from urllib3.exceptions import ReadTimeoutError as Urllib3ReadTimeoutError


# Dummy exception classes for requests timeouts if requests is not available
class _RequestsReadTimeout(Exception):
"""Fallback exception class when requests library is not available."""
@@ -711,22 +711,35 @@ def get_guardrail_config(self) -> Optional[Dict[str, str]]:

def generate_embedding(
self,
text: str,
text: str = "",
image_source: Optional[Union[str, bytes]] = None,
model_id: str = "amazon.titan-embed-text-v1",
dimensions: int = 1024,
max_retries: Optional[int] = None,
) -> List[float]:
"""
Generate an embedding vector for the given text using Amazon Bedrock.
Generate an embedding vector for the given text and/or image_source using Amazon Bedrock.
At least one of text or image_source is required to generate an embedding.
For Titan Multimodal embedding models, both can be provided to produce a query vector that averages the text and image embedding vectors.
For Nova Multimodal embedding models, exactly one of text or image_source must be provided.

Args:
text: The text to generate embeddings for
image_source: The image to generate embeddings for, either an S3 URI (s3://bucket/key) or raw image bytes
model_id: The embedding model ID to use (default: amazon.titan-embed-text-v1)
dimensions: Length of the output embedding vector
max_retries: Optional override for the instance's max_retries setting

Returns:
List of floats representing the embedding vector
"""
if not text or not isinstance(text, str):
# Imported lazily here because idp_common.image requires PIL
from idp_common.image import (
prepare_image,
prepare_bedrock_image_attachment,
)

if (not text or not isinstance(text, str)) and (not image_source):
# Return an empty vector for empty input
return []

@@ -741,12 +754,61 @@ def generate_embedding(
# Normalize whitespace in the input text (tolerate a non-string text when an image is provided)
normalized_text = " ".join(text.split()) if isinstance(text, str) else ""

# Convert the image to base64 if provided; initialize defaults so later checks are safe
image_bytes = b""
image_base64 = ""
if image_source:
image_bytes = prepare_image(image_source)
image_base64 = base64.b64encode(image_bytes).decode("utf-8")

dimensions = int(dimensions)

# Prepare the request body based on the model
if "amazon.titan-embed" in model_id:
request_body = json.dumps({"inputText": normalized_text})
payload_body: Dict[str, Any] = {}

if "amazon.titan-embed-text" in model_id:
if not normalized_text:
raise ValueError(
"Amazon Titan Text models require a text parameter to generate embeddings for."
)
payload_body = {
"inputText": normalized_text,
"dimensions": dimensions,
}
elif "amazon.titan-embed-image" in model_id:
payload_body = {
"embeddingConfig": {
"outputEmbeddingLength": dimensions,
}
}
if normalized_text:
payload_body["inputText"] = normalized_text
if image_base64:
payload_body["inputImage"] = image_base64
elif "amazon.nova-2-multimodal-embeddings" in model_id:
if normalized_text and image_source:
raise ValueError(
"Amazon Nova Multimodal Embedding models require exactly one of text or image parameter, but noth both at the same time."
)
payload_body = {
"taskType": "SINGLE_EMBEDDING",
"singleEmbeddingParams": {
"embeddingPurpose": "GENERIC_INDEX",
"embeddingDimension": dimensions,
}
}
if normalized_text:
payload_body["singleEmbeddingParams"]["text"] = {"truncationMode": "END", "value": normalized_text}
if image_source:
payload_body["singleEmbeddingParams"].update(prepare_bedrock_image_attachment(image_bytes)) # detect image format
payload_body["singleEmbeddingParams"]["image"]["source"]["bytes"] = image_base64
else:
# Default format for other models
request_body = json.dumps({"text": normalized_text})
if not normalized_text:
raise ValueError(
"Default format requires a text parameter to generate embeddings for."
)
payload_body = {"text": normalized_text}

request_body = json.dumps(payload_body)

# Call the recursive embedding function
return self._generate_embedding_with_retry(
@@ -805,6 +867,10 @@ def _generate_embedding_with_retry(
# Handle different response formats based on the model
if "amazon.titan-embed" in model_id:
embedding = response_body.get("embedding", [])
elif "amazon.titan-embed-image" in model_id:
embedding = response_body.get("embedding", [])
elif "amazon.nova-2-multimodal-embeddings" in model_id:
embedding = response_body["embeddings"][0]["embedding"]
else:
# Default extraction format
embedding = response_body.get("embedding", [])
55 changes: 55 additions & 0 deletions lib/idp_common_pkg/idp_common/extraction/service.py
@@ -10,6 +10,7 @@

from __future__ import annotations

import base64
import json
import logging
import os
@@ -433,6 +434,53 @@ def _make_json_serializable(self, obj: Any) -> Any:
# Convert non-serializable objects to string representation
return str(obj)

def _convert_image_uris_to_bytes_in_content(
self, content: list[dict[str, Any]]
) -> list[dict[str, Any]]:
"""
Convert image URIs back to bytes in content array after Lambda processing.

Args:
content: Content array from Lambda that may contain image URIs

Returns:
Content array with image bytes restored
"""
converted_content = []

for item in content:
if "image_uri" in item:
image_uri = item["image_uri"]

# Load image content
if image_uri.startswith("s3://"):
# Direct S3 URI
logger.info(f"Retrieving image {image_uri}")
image_bytes = s3.get_binary_content(image_uri)
else:
raise ValueError(
f"Invalid file path {image_uri} - expecting S3 path"
)

converted_item = image.prepare_bedrock_image_attachment(image_bytes)
elif "image_base64" in item:
image_base64 = item["image_base64"]

# Decode image content
image_bytes = base64.b64decode(image_base64)

converted_item = image.prepare_bedrock_image_attachment(image_bytes)
elif "image" in item:
# Keep existing image objects as-is
converted_item = item.copy()
else:
# Keep non-image items as-is
converted_item = item.copy()

converted_content.append(converted_item)

return converted_content

def _invoke_custom_prompt_lambda(
self, lambda_arn: str, payload: dict[str, Any]
) -> dict[str, Any]:
@@ -486,6 +534,13 @@ def _invoke_custom_prompt_lambda(
logger.error(error_msg)
raise Exception(error_msg)

# Convert image URIs to bytes in the response
result["task_prompt_content"] = (
self._convert_image_uris_to_bytes_in_content(
result["task_prompt_content"]
)
)

return result

except Exception as e:
2 changes: 1 addition & 1 deletion patterns/pattern-2/template.yaml
@@ -1026,7 +1026,7 @@ Resources:
order: 7
custom_prompt_lambda_arn:
type: string
description: "(Optional) ARN of a Lambda function to generate custom extraction prompts. Function name must start with 'GENAIIDP-'. If not provided, default prompts will be used. The Lambda function receives the complete config, prompt placeholders, default task prompt content, and serialized document, and returns custom system_prompt and task_prompt_content. Example: arn:${AWS::Partition}:lambda:us-east-1:123456789012:function:GENAIIDP-my-extractor"
description: !Sub "(Optional) ARN of a Lambda function to generate custom extraction prompts. Function name must start with 'GENAIIDP-'. If not provided, default prompts will be used. The Lambda function receives the complete config, prompt placeholders, default task prompt content, and serialized document, and returns custom system_prompt and task_prompt_content. Example: arn:${AWS::Partition}:lambda:us-east-1:123456789012:function:GENAIIDP-my-extractor"
order: 8
assessment:
order: 5
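
To make the contract above concrete, a hypothetical GENAIIDP-prefixed custom prompt Lambda could look like the sketch below. The return fields (system_prompt, task_prompt_content) and the image_uri/image_base64 item shapes come from the code in this PR; the event key name and the few-shot lookup are assumptions for illustration only:

```python
def lambda_handler(event, context):
    """Hypothetical custom prompt Lambda (sketch only, not the shipped implementation).

    Returns the fields the extraction service expects: a custom system_prompt and
    task_prompt_content. Image items can be returned as S3 URIs ("image_uri") or
    inline base64 ("image_base64"); the service converts them into Bedrock image
    attachments before invoking the model.
    """
    # Assumed event key: the default prompt content assembled by the service.
    default_content = event.get("default_task_prompt_content", [])

    # Hypothetical few-shot example, e.g. retrieved from an S3 Vectors index.
    example_image_uri = "s3://example-dataset-bucket/few-shot/invoice-001.png"

    task_prompt_content = [
        {"text": "Here is a similar document and its expected extraction output:"},
        {"image_uri": example_image_uri},
        {"text": '{"invoice_number": "INV-001", "total": "123.45"}'},
        *default_content,
    ]

    return {
        "system_prompt": "You are an expert at extracting structured fields from invoices.",
        "task_prompt_content": task_prompt_content,
    }
```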
1 change: 1 addition & 0 deletions plugins/dynamic-few-shot-lambda/.gitignore
@@ -0,0 +1 @@
datasets/