Skip to content

Commit 06a94d9

Browse files
author
Daniel Lorch
committed
feat: add dynamic few-shot Lambda invocation to ExtractionService
1 parent cb10ebc commit 06a94d9

File tree

1 file changed

+131
-12
lines changed
  • lib/idp_common_pkg/idp_common/extraction

1 file changed

+131
-12
lines changed

lib/idp_common_pkg/idp_common/extraction/service.py

Lines changed: 131 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from __future__ import annotations
1212

13+
import base64
1314
import json
1415
import logging
1516
import os
@@ -286,7 +287,7 @@ def _build_prompt_content(
286287
)
287288

288289
# Add few-shot examples
289-
content.extend(self._build_few_shot_examples_content())
290+
content.extend(self._build_few_shot_examples_content(image_content))
290291

291292
# Process after examples (only pass images if not already used)
292293
image_for_after = (
@@ -379,24 +380,84 @@ def _prepare_image_attachments(self, image_content: Any) -> list[dict[str, Any]]
379380

380381
return attachments
381382

382-
def _build_few_shot_examples_content(self) -> list[dict[str, Any]]:
383+
def _build_few_shot_examples_content(
384+
self, image_content: Any = None
385+
) -> list[dict[str, Any]]:
383386
"""
384387
Build content items for few-shot examples for a specific class, from the dynamic few-shot Lambda when one is configured, otherwise from the class schema in the configuration.
385388
389+
Args:
390+
image_content: Optional image content, forwarded in the payload to the dynamic few-shot Lambda when one is configured
391+
386392
Returns:
387393
List of content items containing text and image content for examples
388394
"""
389395
content: list[dict[str, Any]] = []
396+
dynamic_few_shot_lambda_arn = self.config.extraction.dynamic_few_shot_lambda_arn
397+
398+
if dynamic_few_shot_lambda_arn and dynamic_few_shot_lambda_arn.strip():
399+
logger.info(
400+
f"Using Lambda to retrieve examples: {dynamic_few_shot_lambda_arn}"
401+
)
402+
403+
# Build the Lambda payload; JSON-serializability is verified below, with the comprehensive helper as fallback
404+
payload = {
405+
"class_label": self._class_label,
406+
"document_text": self._document_text,
407+
"image_content": image_content,
408+
}
390409

391-
# Use the stored class schema
392-
if not self._class_schema:
393-
logger.warning(
394-
f"No class schema found for '{self._class_label}' for few-shot examples"
410+
# Test JSON serialization before sending to Lambda to catch any remaining issues
411+
try:
412+
json.dumps(payload)
413+
logger.info("Lambda payload successfully serialized")
414+
except (TypeError, ValueError) as e:
415+
logger.error(
416+
f"Lambda payload still contains non-serializable data: {e}"
417+
)
418+
logger.info("Using comprehensive serialization as fallback")
419+
# Apply comprehensive serialization to entire payload
420+
payload = self._make_json_serializable(payload)
421+
try:
422+
json.dumps(payload)
423+
logger.info("Comprehensive serialization successful")
424+
except (TypeError, ValueError) as e2:
425+
logger.error(f"Even comprehensive serialization failed: {e2}")
426+
# Ultimate fallback to minimal payload
427+
payload = {
428+
"class_label": self._class_label,
429+
"document_text": self._document_text,
430+
}
431+
432+
# Invoke dynamic few-shot Lambda and get result
433+
examples = self._invoke_dynamic_few_shot_lambda(
434+
dynamic_few_shot_lambda_arn, payload
395435
)
396-
return content
397436

398-
# Get examples from the JSON Schema for this specific class
399-
content = build_few_shot_extraction_examples_content(self._class_schema)
437+
logger.info(f"Retrieved {len(examples)} examples from few-shot Lambda invocation")
438+
439+
# flatten examples content
440+
for example in examples:
441+
content.append({"text": example.get("attributes_prompt")})
442+
for image_content in example.get("image_content", []):
443+
content.append(image.prepare_bedrock_image_attachment(image_content))
444+
445+
logger.info(content)
446+
447+
else:
448+
logger.info(
449+
f"No dynamic few-shot Lambda configured - add examples from config for class: {self._class_label}"
450+
)
451+
452+
# Use the stored class schema
453+
if not self._class_schema:
454+
logger.warning(
455+
f"No class schema found for '{self._class_label}' for few-shot examples"
456+
)
457+
return content
458+
459+
# Get examples from the JSON Schema for this specific class
460+
content = build_few_shot_extraction_examples_content(self._class_schema)
400461

401462
return content
402463

@@ -427,8 +488,8 @@ def _make_json_serializable(self, obj: Any) -> Any:
427488
# Handle objects with to_dict method
428489
return self._make_json_serializable(obj.to_dict())
429490
elif isinstance(obj, bytes):
430-
# Convert bytes to base64 string or placeholder
431-
return f"<bytes_object_{len(obj)}_bytes>"
491+
# Convert bytes to base64 string
492+
return base64.b64encode(obj).decode("utf-8")
432493
else:
433494
try:
434495
# Test if it's already JSON serializable
@@ -498,6 +559,65 @@ def _invoke_custom_prompt_lambda(
498559
logger.error(error_msg)
499560
raise Exception(error_msg)
500561

562+
def _invoke_dynamic_few_shot_lambda(
563+
self, lambda_arn: str, payload: dict
564+
) -> list[dict[str, Any]]:
565+
"""
566+
Invoke dynamic few-shot Lambda function with JSON-serializable payload.
567+
568+
Args:
569+
lambda_arn: ARN of the Lambda function to invoke
570+
payload: Payload to send to Lambda function (must be JSON serializable)
571+
572+
Returns:
573+
List of example dicts, each with its "image_content" entries base64-decoded to bytes and the expected attributes prompt
574+
575+
Raises:
576+
Exception: If Lambda invocation fails or returns invalid response
577+
"""
578+
import boto3
579+
580+
lambda_client = boto3.client("lambda", region_name=self.region)
581+
582+
try:
583+
logger.info(f"Invoking dynamic few-shot Lambda: {lambda_arn}")
584+
response = lambda_client.invoke(
585+
FunctionName=lambda_arn,
586+
InvocationType="RequestResponse",
587+
Payload=json.dumps(payload),
588+
)
589+
590+
if response.get("FunctionError"):
591+
error_payload = response.get("Payload", b"").read().decode()
592+
error_msg = f"Dynamic few-shot Lambda failed: {error_payload}"
593+
logger.error(error_msg)
594+
raise Exception(error_msg)
595+
596+
result = json.loads(response["Payload"].read())
597+
logger.info("Dynamic few-shot Lambda invoked successfully")
598+
599+
# Validate response structure
600+
if not isinstance(result, list):
601+
error_msg = f"Dynamic few-shot Lambda returned invalid response format: expected list, got {type(result)}"
602+
logger.error(error_msg)
603+
raise Exception(error_msg)
604+
605+
# Base64 decode images
606+
for example in result:
607+
decoded_images = []
608+
for image_base64 in example.get("image_content", []):
609+
image_data = base64.b64decode(image_base64)
610+
decoded_images.append(image_data)
611+
612+
example["image_content"] = decoded_images
613+
614+
return result
615+
616+
except Exception as e:
617+
error_msg = f"Failed to invoke custom prompt Lambda {lambda_arn}: {str(e)}"
618+
logger.error(error_msg)
619+
raise Exception(error_msg)
620+
501621
def _reset_context(self) -> None:
502622
"""Reset instance variables for clean state before processing."""
503623
self._document_text = ""
@@ -1078,7 +1198,6 @@ def process_document_section(self, document: Document, section_id: str) -> Docum
10781198
content, system_prompt = self._build_extraction_content(
10791199
document, page_images
10801200
)
1081-
10821201
# Invoke model
10831202
result = self._invoke_extraction_model(content, system_prompt, section_info)
10841203

0 commit comments

Comments
 (0)