Skip to content

Commit cbd52d5

Browse files
committed
fixed pen test findings, fixed configuration loading mechanism, updated sanitized parameters to avoid printing documents in content
1 parent b042caf commit cbd52d5

File tree

15 files changed

+299
-744
lines changed

15 files changed

+299
-744
lines changed

docs/discovery.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1236,6 +1236,9 @@ def discovery_with_fallback(discovery_service, document_key, ground_truth_key=No
12361236
**Discovery Output Format**
12371237
- Output format is configurable via View/Edit configuration. The JSON format should follow the custom classes format.
12381238
- Output in any other format will result in failure.
1239+
**Production Usage**
1240+
- We recommend not using the Discovery module in production. This reduces the risk of hallucination during document discovery.
1241+
- We recommend using the Discovery module in a lower environment to discover and construct configurations, then exporting the tested configuration to your production deployment.
12391242

12401243

12411244
The Discovery module provides a powerful foundation for understanding and processing new document types. By following these guidelines and best practices, you can effectively leverage the module to bootstrap document processing workflows and continuously improve their accuracy and coverage.

lib/idp_common_pkg/idp_common/bedrock/client.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -858,6 +858,9 @@ def _sanitize_messages_for_logging(self, messages: List[Dict[str, Any]]) -> List
858858
elif isinstance(content_item, dict) and 'bytes' in content_item:
859859
# Handle raw binary format
860860
content_item['bytes'] = '[binary_data]'
861+
elif isinstance(content_item, dict) and 'document' in content_item:
862+
# Redact document payloads so their contents are not written to logs
863+
content_item['document'] = '[document_data]'
861864

862865
return sanitized
863866

lib/idp_common_pkg/idp_common/discovery/classes_discovery.py

Lines changed: 25 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
22
# SPDX-License-Identifier: MIT-0
3-
import base64
43
import json
54
import logging
65
import os
@@ -10,6 +9,7 @@
109
from botocore.exceptions import ClientError
1110

1211
from idp_common import bedrock, image
12+
from idp_common.config import ConfigurationReader
1313
from idp_common.utils.s3util import S3Util
1414

1515
logger = logging.getLogger(__name__)
@@ -20,15 +20,28 @@ def __init__(
2020
self,
2121
input_bucket: str,
2222
input_prefix: str,
23-
config: Optional[dict] = None,
24-
region: Optional[str] = "us-west-2",
23+
region: Optional[str] = None,
2524
):
2625
self.input_bucket = input_bucket
2726
self.input_prefix = input_prefix
28-
self.region = region or os.environ.get("AWS_REGION", "us-east-1")
27+
self.region = region or os.environ.get("AWS_REGION")
2928

3029
# Load configuration
31-
self.config = config or self._load_default_config()
30+
self.configuration_table_name = os.environ.get("CONFIGURATION_TABLE_NAME")
31+
32+
if not self.configuration_table_name:
33+
raise ValueError(
34+
"Configuration table name not provided. Set CONFIGURATION_TABLE_NAME environment variable."
35+
)
36+
37+
try:
38+
self.config_reader = ConfigurationReader(
39+
table_name=self.configuration_table_name
40+
)
41+
self.config = self.config_reader.get_merged_configuration()
42+
except Exception as e:
43+
logger.error(f"Failed to load configuration from DynamoDB: {e}")
44+
raise Exception(f"Failed to load configuration from DynamoDB: {str(e)}")
3245

3346
# Get discovery configuration
3447
self.discovery_config = self.config.get("discovery", {})
@@ -40,87 +53,14 @@ def __init__(
4053
# Initialize Bedrock client using the common pattern
4154
self.bedrock_client = bedrock.BedrockClient(region=self.region)
4255

43-
self.configuration_table_name = os.environ.get("CONFIGURATION_TABLE_NAME", "")
44-
dynamodb = boto3.resource("dynamodb")
45-
self.configuration_table = dynamodb.Table(self.configuration_table_name)
56+
if self.configuration_table_name:
57+
dynamodb = boto3.resource("dynamodb")
58+
self.configuration_table = dynamodb.Table(self.configuration_table_name)
59+
else:
60+
self.configuration_table = None
4661

4762
return
4863

49-
def _load_default_config(self):
50-
"""Load default discovery configuration."""
51-
return {
52-
"discovery": {
53-
"without_ground_truth": {
54-
"model_id": "anthropic.claude-3-sonnet-20240229-v1:0",
55-
"temperature": 1.0,
56-
"top_p": 0.1,
57-
"max_tokens": 10000,
58-
"system_prompt": "You are an expert in processing forms. Extracting data from images and documents. Analyze forms line by line to identify field names, data types, and organizational structure. Focus on creating comprehensive blueprints for document processing without extracting actual values.",
59-
"user_prompt": """This image contains forms data. Analyze the form line by line.
60-
Image may contains multiple pages, process all the pages.
61-
Form may contain multiple name value pair in one line.
62-
Extract all the names in the form including the name value pair which doesn't have value.
63-
Organize them into groups, extract field_name, data_type and field description.
64-
Field_name should be less than 60 characters, should not have space use '-' instead of space.
65-
field_description is a brief description of the field and the location of the field like box number or line number in the form and section of the form.
66-
Field_name should be unique within the group.
67-
Add two fields document_class and document_description.
68-
For document_class generate a short name based on the document content like W4, I-9, Paystub.
69-
For document_description generate a description about the document in less than 50 words.
70-
Group the fields based on the section they are grouped in the form. Group should have attributeType as "group".
71-
If the group repeats and follows table format, update the attributeType as "list".
72-
Do not extract the values.
73-
Return the extracted data in JSON format.""",
74-
},
75-
"with_ground_truth": {
76-
"model_id": "anthropic.claude-3-sonnet-20240229-v1:0",
77-
"temperature": 1.0,
78-
"top_p": 0.1,
79-
"max_tokens": 10000,
80-
"system_prompt": "You are an expert in processing forms. Extracting data from images and documents. Use provided ground truth data as reference to optimize field extraction and ensure consistency with expected document structure and field definitions.",
81-
"user_prompt": """This image contains unstructured data. Analyze the data line by line using the provided ground truth as reference.
82-
<GROUND_TRUTH_REFERENCE>
83-
{ground_truth_json}
84-
</GROUND_TRUTH_REFERENCE>
85-
Ground truth reference JSON has the fields we are interested in extracting from the document/image. Use the ground truth to optimize field extraction. Match field names, data types, and groupings from the reference.
86-
Image may contain multiple pages, process all pages.
87-
Extract all field names including those without values.
88-
Do not change the group name and field name from ground truth in the extracted data json.
89-
Add field_description field for every field which will contain instruction to LLM to extract the field data from the image/document. Add data_type field for every field.
90-
Add two fields document_class and document_description.
91-
For document_class generate a short name based on the document content like W4, I-9, Paystub.
92-
For document_description generate a description about the document in less than 50 words.
93-
If the group repeats and follows table format, update the attributeType as "list".
94-
Do not extract the values.""",
95-
},
96-
"output_format": {
97-
"sample_json": """{
98-
"document_class" : "Form-1040",
99-
"document_description" : "Brief summary of the document",
100-
"groups" : [
101-
{
102-
"name" : "PersonalInformation",
103-
"description" : "Personal information of Tax payer",
104-
"attributeType" : "group",
105-
"groupAttributes" : [
106-
{
107-
"name": "FirstName",
108-
"dataType" : "string",
109-
"description" : "First Name of Taxpayer"
110-
},
111-
{
112-
"name": "Age",
113-
"dataType" : "number",
114-
"description" : "Age of Taxpayer"
115-
}
116-
]
117-
}
118-
]
119-
}"""
120-
},
121-
}
122-
}
123-
12464
"""
12565
Recursively convert all values to strings
12666
"""
@@ -328,13 +268,6 @@ def _remove_duplicates(self, groups):
328268
group["groupAttributes"] = groupAttributesArray
329269
return groups
330270

331-
def _parse_s3_uri(self, s3_uri: str):
332-
"""Parse S3 URI to extract bucket and key."""
333-
if not s3_uri.startswith("s3://"):
334-
raise ValueError("Invalid S3 URI format")
335-
parts = s3_uri[5:].split("/", 1)
336-
return parts[0], parts[1] if len(parts) > 1 else ""
337-
338271
def _load_ground_truth(self, bucket: str, key: str):
339272
"""Load ground truth JSON data from S3."""
340273
try:
@@ -347,9 +280,7 @@ def _load_ground_truth(self, bucket: str, key: str):
347280
def _extract_data_from_document(self, document_content, file_extension):
348281
try:
349282
# Get configuration for without ground truth
350-
model_id = self.without_gt_config.get(
351-
"model_id", "anthropic.claude-3-sonnet-20240229-v1:0"
352-
)
283+
model_id = self.without_gt_config.get("model_id", "us.amazon.nova-pro-v1:0")
353284
system_prompt = self.without_gt_config.get(
354285
"system_prompt",
355286
"You are an expert in processing forms. Extracting data from images and documents",
@@ -424,9 +355,7 @@ def _extract_data_from_document_with_ground_truth(
424355
"""Extract data from document using ground truth as reference."""
425356
try:
426357
# Get configuration for with ground truth
427-
model_id = self.with_gt_config.get(
428-
"model_id", "anthropic.claude-3-sonnet-20240229-v1:0"
429-
)
358+
model_id = self.with_gt_config.get("model_id", "us.amazon.nova-pro-v1:0")
430359
system_prompt = self.with_gt_config.get(
431360
"system_prompt",
432361
"You are an expert in processing forms. Extracting data from images and documents",
@@ -483,10 +412,6 @@ def _extract_data_from_document_with_ground_truth(
483412
logger.error(f"Error extracting data with Bedrock using ground truth: {e}")
484413
return None
485414

486-
def _get_base64_image(self, file_in_bytes):
487-
"""Get base64 encoded image data from sample image."""
488-
return base64.b64encode(file_in_bytes).decode("utf-8")
489-
490415
def _prompt_classes_discovery_with_ground_truth(self, ground_truth_data):
491416
ground_truth_json = json.dumps(ground_truth_data, indent=2)
492417
sample_output_format = self._sample_output_format()

lib/idp_common_pkg/tests/integration/test_discovery_config_integration.py

Lines changed: 48 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -113,13 +113,22 @@ def test_end_to_end_config_flow_without_ground_truth(
113113
mock_extract_text.return_value = json.dumps(expected_result)
114114
mock_s3_get_bytes.return_value = b"mock document content"
115115

116-
# Initialize ClassesDiscovery with YAML config
117-
discovery = ClassesDiscovery(
118-
input_bucket=self.test_bucket,
119-
input_prefix=self.test_prefix,
120-
config=self.config_dict,
121-
region=self.test_region,
122-
)
116+
# Mock ConfigurationReader to return config_dict
117+
with patch(
118+
"idp_common.discovery.classes_discovery.ConfigurationReader"
119+
) as mock_config_reader:
120+
mock_reader_instance = mock_config_reader.return_value
121+
mock_reader_instance.get_merged_configuration.return_value = (
122+
self.config_dict
123+
)
124+
125+
with patch.dict("os.environ", {"CONFIGURATION_TABLE_NAME": "test-table"}):
126+
# Initialize ClassesDiscovery with YAML config
127+
discovery = ClassesDiscovery(
128+
input_bucket=self.test_bucket,
129+
input_prefix=self.test_prefix,
130+
region=self.test_region,
131+
)
123132

124133
# Execute discovery without ground truth
125134
result = discovery.discovery_classes_with_document(
@@ -218,13 +227,22 @@ def mock_s3_side_effect(bucket, key):
218227

219228
mock_s3_get_bytes.side_effect = mock_s3_side_effect
220229

221-
# Initialize ClassesDiscovery with YAML config
222-
discovery = ClassesDiscovery(
223-
input_bucket=self.test_bucket,
224-
input_prefix=self.test_prefix,
225-
config=self.config_dict,
226-
region=self.test_region,
227-
)
230+
# Mock ConfigurationReader to return config_dict
231+
with patch(
232+
"idp_common.discovery.classes_discovery.ConfigurationReader"
233+
) as mock_config_reader:
234+
mock_reader_instance = mock_config_reader.return_value
235+
mock_reader_instance.get_merged_configuration.return_value = (
236+
self.config_dict
237+
)
238+
239+
with patch.dict("os.environ", {"CONFIGURATION_TABLE_NAME": "test-table"}):
240+
# Initialize ClassesDiscovery with YAML config
241+
discovery = ClassesDiscovery(
242+
input_bucket=self.test_bucket,
243+
input_prefix=self.test_prefix,
244+
region=self.test_region,
245+
)
228246

229247
# Execute discovery with ground truth
230248
result = discovery.discovery_classes_with_document_and_ground_truth(
@@ -290,13 +308,22 @@ def test_config_validation_and_defaults(
290308
}
291309
}
292310

293-
# Initialize with incomplete config
294-
discovery = ClassesDiscovery(
295-
input_bucket=self.test_bucket,
296-
input_prefix=self.test_prefix,
297-
config=incomplete_config,
298-
region=self.test_region,
299-
)
311+
# Mock ConfigurationReader to return incomplete config
312+
with patch(
313+
"idp_common.discovery.classes_discovery.ConfigurationReader"
314+
) as mock_config_reader:
315+
mock_reader_instance = mock_config_reader.return_value
316+
mock_reader_instance.get_merged_configuration.return_value = (
317+
incomplete_config
318+
)
319+
320+
with patch.dict("os.environ", {"CONFIGURATION_TABLE_NAME": "test-table"}):
321+
# Initialize with incomplete config
322+
discovery = ClassesDiscovery(
323+
input_bucket=self.test_bucket,
324+
input_prefix=self.test_prefix,
325+
region=self.test_region,
326+
)
300327

301328
# Verify that missing fields get default values
302329
without_gt_config = discovery.without_gt_config

0 commit comments

Comments
 (0)