|
10 | 10 |
|
11 | 11 | from __future__ import annotations |
12 | 12 |
|
| 13 | +import base64 |
13 | 14 | import json |
14 | 15 | import logging |
15 | 16 | import os |
@@ -286,7 +287,7 @@ def _build_prompt_content( |
286 | 287 | ) |
287 | 288 |
|
288 | 289 | # Add few-shot examples |
289 | | - content.extend(self._build_few_shot_examples_content()) |
| 290 | + content.extend(self._build_few_shot_examples_content(image_content)) |
290 | 291 |
|
291 | 292 | # Process after examples (only pass images if not already used) |
292 | 293 | image_for_after = ( |
@@ -379,24 +380,84 @@ def _prepare_image_attachments(self, image_content: Any) -> list[dict[str, Any]] |
379 | 380 |
|
380 | 381 | return attachments |
381 | 382 |
|
def _build_few_shot_examples_content(
    self, image_content: Any = None
) -> list[dict[str, Any]]:
    """
    Build content items for few-shot examples for the current class.

    When ``config.extraction.dynamic_few_shot_lambda_arn`` is set to a
    non-blank value, the examples are retrieved by invoking that Lambda with
    the class label, document text and image content. Otherwise the examples
    are built from the stored class schema.

    Args:
        image_content: Optional image content, forwarded to the dynamic
            few-shot Lambda as part of the request payload

    Returns:
        List of content items containing text and image content for examples
    """
    content: list[dict[str, Any]] = []
    dynamic_few_shot_lambda_arn = self.config.extraction.dynamic_few_shot_lambda_arn

    if not (dynamic_few_shot_lambda_arn and dynamic_few_shot_lambda_arn.strip()):
        logger.info(
            f"No dynamic few-shot Lambda configured - add examples from config for class: {self._class_label}"
        )

        # Use the stored class schema
        if not self._class_schema:
            logger.warning(
                f"No class schema found for '{self._class_label}' for few-shot examples"
            )
            return content

        # Get examples from the JSON Schema for this specific class
        return build_few_shot_extraction_examples_content(self._class_schema)

    logger.info(f"Using Lambda to retrieve examples: {dynamic_few_shot_lambda_arn}")

    # Start from the raw values; they are only converted if they turn out
    # not to be JSON serializable (e.g. raw image bytes).
    payload = {
        "class_label": self._class_label,
        "document_text": self._document_text,
        "image_content": image_content,
    }

    # Test JSON serialization before sending to Lambda to catch any remaining issues
    try:
        json.dumps(payload)
        logger.info("Lambda payload successfully serialized")
    except (TypeError, ValueError) as e:
        logger.error(f"Lambda payload still contains non-serializable data: {e}")
        logger.info("Using comprehensive serialization as fallback")
        # Apply comprehensive serialization to entire payload
        payload = self._make_json_serializable(payload)
        try:
            json.dumps(payload)
            logger.info("Comprehensive serialization successful")
        except (TypeError, ValueError) as e2:
            logger.error(f"Even comprehensive serialization failed: {e2}")
            # Ultimate fallback to minimal payload (image content is dropped)
            payload = {
                "class_label": self._class_label,
                "document_text": self._document_text,
            }

    # Invoke dynamic few-shot Lambda and get result
    examples = self._invoke_dynamic_few_shot_lambda(
        dynamic_few_shot_lambda_arn, payload
    )

    logger.info(f"Retrieved {len(examples)} examples from few-shot Lambda invocation")

    # Flatten examples into content items: prompt text first, then its images
    for example in examples:
        prompt_text = example.get("attributes_prompt")
        if prompt_text:
            content.append({"text": prompt_text})
        else:
            # A text item without a string value is not a usable content item
            logger.warning(
                "Few-shot example from Lambda has no 'attributes_prompt'; skipping its text item"
            )

        # Distinct loop name so the ``image_content`` parameter is not clobbered
        for example_image in example.get("image_content") or []:
            content.append(image.prepare_bedrock_image_attachment(example_image))

    # Log only the size: the items may carry raw image data
    logger.debug(f"Built {len(content)} few-shot content items from Lambda examples")

    return content
402 | 463 |
|
@@ -427,8 +488,8 @@ def _make_json_serializable(self, obj: Any) -> Any: |
427 | 488 | # Handle objects with to_dict method |
428 | 489 | return self._make_json_serializable(obj.to_dict()) |
429 | 490 | elif isinstance(obj, bytes): |
430 | | - # Convert bytes to base64 string or placeholder |
431 | | - return f"<bytes_object_{len(obj)}_bytes>" |
| 491 | + # Convert bytes to base64 string |
| 492 | + return base64.b64encode(obj).decode("utf-8") |
432 | 493 | else: |
433 | 494 | try: |
434 | 495 | # Test if it's already JSON serializable |
@@ -498,6 +559,65 @@ def _invoke_custom_prompt_lambda( |
498 | 559 | logger.error(error_msg) |
499 | 560 | raise Exception(error_msg) |
500 | 561 |
|
def _invoke_dynamic_few_shot_lambda(
    self, lambda_arn: str, payload: dict
) -> list[dict[str, Any]]:
    """
    Invoke dynamic few-shot Lambda function with JSON-serializable payload.

    The payload is sent as JSON in a synchronous (RequestResponse)
    invocation. The response must be a JSON list of objects; every entry of
    an object's "image_content" list is base64-decoded to bytes in place.

    Args:
        lambda_arn: ARN of the Lambda function to invoke
        payload: Payload to send to Lambda function (must be JSON serializable)

    Returns:
        List of example dicts as returned by the Lambda, with each
        "image_content" replaced by a list of decoded image bytes

    Raises:
        Exception: If Lambda invocation fails or returns invalid response
    """
    import boto3

    lambda_client = boto3.client("lambda", region_name=self.region)

    try:
        logger.info(f"Invoking dynamic few-shot Lambda: {lambda_arn}")
        response = lambda_client.invoke(
            FunctionName=lambda_arn,
            InvocationType="RequestResponse",
            Payload=json.dumps(payload),
        )

        if response.get("FunctionError"):
            # "Payload" is a stream; guard against it being absent instead
            # of calling .read() on a bytes default.
            error_stream = response.get("Payload")
            error_payload = (
                error_stream.read().decode() if error_stream is not None else ""
            )
            error_msg = f"Dynamic few-shot Lambda failed: {error_payload}"
            logger.error(error_msg)
            raise Exception(error_msg)

        result = json.loads(response["Payload"].read())
        logger.info("Dynamic few-shot Lambda invoked successfully")

        # Validate response structure
        if not isinstance(result, list):
            error_msg = f"Dynamic few-shot Lambda returned invalid response format: expected list, got {type(result)}"
            logger.error(error_msg)
            raise Exception(error_msg)

        # Base64 decode images
        for example in result:
            if not isinstance(example, dict):
                error_msg = f"Dynamic few-shot Lambda returned invalid example format: expected dict, got {type(example)}"
                logger.error(error_msg)
                raise Exception(error_msg)

            # ``or []`` also covers an explicit null "image_content"
            example["image_content"] = [
                base64.b64decode(image_base64)
                for image_base64 in example.get("image_content") or []
            ]

        return result

    except Exception as e:
        # Errors raised above are funnelled through here as well, so callers
        # always get one exception type with the ARN in the message.
        error_msg = f"Failed to invoke dynamic few-shot Lambda {lambda_arn}: {str(e)}"
        logger.error(error_msg)
        raise Exception(error_msg) from e
| 620 | + |
501 | 621 | def _reset_context(self) -> None: |
502 | 622 | """Reset instance variables for clean state before processing.""" |
503 | 623 | self._document_text = "" |
@@ -1078,7 +1198,6 @@ def process_document_section(self, document: Document, section_id: str) -> Docum |
1078 | 1198 | content, system_prompt = self._build_extraction_content( |
1079 | 1199 | document, page_images |
1080 | 1200 | ) |
1081 | | - |
1082 | 1201 | # Invoke model |
1083 | 1202 | result = self._invoke_extraction_model(content, system_prompt, section_info) |
1084 | 1203 |
|
|
0 commit comments