Skip to content

Commit d793ac0

Browse files
author
Bob Strahan
committed
align notebook logic, and gitignore temp data created by notebooks
1 parent 735d2c4 commit d793ac0

15 files changed

+268
-85
lines changed
Lines changed: 132 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,134 @@
11
# Assessment Service Configuration
22
assessment:
3-
default_confidence_threshold: "0.9"
4-
top_p: "0.1"
5-
max_tokens: "4096"
6-
top_k: "5"
7-
temperature: "0.0"
8-
model: "us.amazon.nova-pro-v1:0"
9-
system_prompt: "You are a document analysis assessment expert. Your task is to evaluate the confidence and accuracy of extraction results by analyzing the source document evidence. Respond only with JSON containing confidence scores and reasoning for each extracted attribute."
10-
task_prompt: "<background>\nYou are an expert document analysis assessment system. Your task is to evaluate the confidence and accuracy of extraction results for a document of class {DOCUMENT_CLASS}.\n</background>\n\n<task>\nAnalyze the extraction results against the source document and provide confidence assessments for each extracted attribute. Consider factors such as:\n1. Text clarity and OCR quality in the source regions 2. Alignment between extracted values and document content 3. Presence of clear evidence supporting the extraction 4. Potential ambiguity or uncertainty in the source material 5. Completeness and accuracy of the extracted information\n</task>\n\n<assessment-guidelines>\nFor each attribute, provide: 1. A confidence score between 0.0 and 1.0 where:\n - 1.0 = Very high confidence, clear and unambiguous evidence\n - 0.8-0.9 = High confidence, strong evidence with minor uncertainty\n - 0.6-0.7 = Medium confidence, reasonable evidence but some ambiguity\n - 0.4-0.5 = Low confidence, weak or unclear evidence\n - 0.0-0.3 = Very low confidence, little to no supporting evidence\n\n2. 
A clear reason explaining the confidence score, including:\n - What evidence supports or contradicts the extraction\n - Any OCR quality issues that affect confidence\n - Clarity of the source document in relevant areas\n - Any ambiguity or uncertainty factors\n\nGuidelines: - Base assessments on actual document content and OCR quality - Consider both text-based evidence and visual/layout clues - Account for OCR confidence scores when provided - Be objective and specific in reasoning - If an extraction appears incorrect, score accordingly with explanation\n</assessment-guidelines>\n<attributes-definitions>\n{ATTRIBUTE_NAMES_AND_DESCRIPTIONS}\n</attributes-definitions>\n\n<<CACHEPOINT>>\n\n<extraction-results>\n{EXTRACTION_RESULTS}\n</extraction-results>\n\n<document-image>\n{DOCUMENT_IMAGE}\n</document-image>\n\n<ocr-text-confidence-results>\n{OCR_TEXT_CONFIDENCE}\n</ocr-text-confidence-results>\n\n<final-instructions>\nAnalyze the extraction results against the source document and provide confidence assessments. Return a JSON object with the following structure:\n\n {\n \"attribute_name_1\": {\n \"confidence_score\": 0.85,\n \"confidence_reason\": \"Clear text evidence found in document header with high OCR confidence (0.98). Value matches exactly.\"\n },\n \"attribute_name_2\": {\n \"confidence_score\": 0.65,\n \"confidence_reason\": \"Text is partially unclear due to poor scan quality. OCR confidence low (0.72) in this region.\"\n }\n }\n\nInclude assessments for ALL attributes present in the extraction results.\n</final-instructions>"
3+
default_confidence_threshold: '0.9'
4+
top_p: '0.1'
5+
max_tokens: '10000'
6+
top_k: '5'
7+
temperature: '0.0'
8+
model: us.anthropic.claude-3-7-sonnet-20250219-v1:0
9+
system_prompt: >-
10+
You are a document analysis assessment expert. Your task is to evaluate the confidence of extraction results by analyzing the source document evidence. Respond only with JSON containing confidence scores for each extracted attribute.
11+
task_prompt: >-
12+
<background>
13+
14+
You are an expert document analysis assessment system. Your task is to evaluate the confidence of extraction results for a document of class {DOCUMENT_CLASS}.
15+
16+
</background>
17+
18+
19+
<task>
20+
21+
Analyze the extraction results against the source document and provide confidence assessments for each extracted attribute. Consider factors such as:
22+
23+
1. Text clarity and OCR quality in the source regions
24+
2. Alignment between extracted values and document content
25+
3. Presence of clear evidence supporting the extraction
26+
4. Potential ambiguity or uncertainty in the source material
27+
5. Completeness and accuracy of the extracted information
28+
29+
</task>
30+
31+
32+
<assessment-guidelines>
33+
34+
For each attribute, provide:
35+
A confidence score between 0.0 and 1.0 where:
36+
- 1.0 = Very high confidence, clear and unambiguous evidence
37+
- 0.8-0.9 = High confidence, strong evidence with minor uncertainty
38+
- 0.6-0.7 = Medium confidence, reasonable evidence but some ambiguity
39+
- 0.4-0.5 = Low confidence, weak or unclear evidence
40+
- 0.0-0.3 = Very low confidence, little to no supporting evidence
41+
42+
Guidelines:
43+
- Base assessments on actual document content and OCR quality
44+
- Consider both text-based evidence and visual/layout clues
45+
- Account for OCR confidence scores when provided
46+
- Be objective and specific in reasoning
47+
- If an extraction appears incorrect, score accordingly with explanation
48+
49+
</assessment-guidelines>
50+
51+
<attributes-definitions>
52+
53+
{ATTRIBUTE_NAMES_AND_DESCRIPTIONS}
54+
55+
</attributes-definitions>
56+
57+
58+
<<CACHEPOINT>>
59+
60+
61+
<extraction-results>
62+
63+
{EXTRACTION_RESULTS}
64+
65+
</extraction-results>
66+
67+
68+
<document-image>
69+
70+
{DOCUMENT_IMAGE}
71+
72+
</document-image>
73+
74+
75+
<ocr-text-confidence-results>
76+
77+
{OCR_TEXT_CONFIDENCE}
78+
79+
</ocr-text-confidence-results>
80+
81+
82+
<final-instructions>
83+
84+
Analyze the extraction results against the source document and provide confidence assessments. Return a JSON object with the following structure based on the attribute type:
85+
86+
For SIMPLE attributes:
87+
{
88+
"simple_attribute_name": {
89+
"confidence": 0.85
90+
}
91+
}
92+
93+
For GROUP attributes (nested object structure):
94+
{
95+
"group_attribute_name": {
96+
"sub_attribute_1": {
97+
"confidence": 0.90
98+
},
99+
"sub_attribute_2": {
100+
"confidence": 0.75
101+
}
102+
}
103+
}
104+
105+
For LIST attributes (array of assessed items):
106+
{
107+
"list_attribute_name": [
108+
{
109+
"item_attribute_1": {
110+
"confidence": 0.95
111+
},
112+
"item_attribute_2": {
113+
"confidence": 0.88
114+
}
115+
},
116+
{
117+
"item_attribute_1": {
118+
"confidence": 0.92
119+
},
120+
"item_attribute_2": {
121+
"confidence": 0.70
122+
}
123+
}
124+
]
125+
}
126+
127+
IMPORTANT:
128+
- For LIST attributes like "Transactions", assess EACH individual item in the list separately
129+
- Each transaction should be assessed as a separate object in the array
130+
- Do NOT provide aggregate assessments for list items - assess each one individually
131+
- Include assessments for ALL attributes present in the extraction results
132+
- Match the exact structure of the extracted data
133+
134+
</final-instructions>

notebooks/examples/step0_setup.ipynb

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,16 @@
2222
"## 1. Install Dependencies"
2323
]
2424
},
25+
{
26+
"cell_type": "code",
27+
"execution_count": null,
28+
"metadata": {},
29+
"outputs": [],
30+
"source": [
31+
"ROOTDIR=\"../..\"\n",
32+
"SAMPLE_PDF_PATH = f\"{ROOTDIR}/samples/rvl_cdip_package.pdf\""
33+
]
34+
},
2535
{
2636
"cell_type": "code",
2737
"execution_count": null,
@@ -32,7 +42,6 @@
3242
"%load_ext autoreload\n",
3343
"%autoreload 2\n",
3444
"\n",
35-
"ROOTDIR=\"../..\"\n",
3645
"# First uninstall existing package (to ensure we get the latest version)\n",
3746
"%pip uninstall -y idp_common\n",
3847
"\n",
@@ -144,9 +153,6 @@
144153
"account_id = sts_client.get_caller_identity()[\"Account\"]\n",
145154
"region = os.environ['AWS_REGION']\n",
146155
"\n",
147-
"# Define sample PDF path \n",
148-
"SAMPLE_PDF_PATH = f\"{ROOTDIR}/samples/rvl_cdip_package.pdf\"\n",
149-
"\n",
150156
"# Create unique bucket names based on account ID and region\n",
151157
"input_bucket_name = os.getenv(\"IDP_INPUT_BUCKET_NAME\", f\"idp-modular-input-{account_id}-{region}\")\n",
152158
"output_bucket_name = os.getenv(\"IDP_OUTPUT_BUCKET_NAME\", f\"idp-modular-output-{account_id}-{region}\")\n",
@@ -225,7 +231,7 @@
225231
"source": [
226232
"# Initialize a new Document\n",
227233
"document = Document(\n",
228-
" id=\"modular-pipeline-document\",\n",
234+
" id=\"bank_statement\",\n",
229235
" input_bucket=input_bucket_name,\n",
230236
" input_key=sample_file_key,\n",
231237
" output_bucket=output_bucket_name,\n",
@@ -252,7 +258,7 @@
252258
"outputs": [],
253259
"source": [
254260
"# Create data directory if it doesn't exist\n",
255-
"data_dir = Path(\"data/step0_setup\")\n",
261+
"data_dir = Path(\".data/step0_setup\")\n",
256262
"data_dir.mkdir(parents=True, exist_ok=True)\n",
257263
"\n",
258264
"# Save document object as JSON\n",
@@ -301,7 +307,7 @@
301307
"print(f\"✅ Configuration loaded: {len(CONFIG)} sections\")\n",
302308
"print(f\"✅ S3 buckets ready: {input_bucket_name}, {output_bucket_name}\")\n",
303309
"print(f\"✅ Sample file uploaded: {sample_file_key}\")\n",
304-
"print(f\"✅ Data saved to: data/step0_setup/\")\n",
310+
"print(f\"✅ Data saved to: .data/step0_setup/\")\n",
305311
"print(\"\\n📌 Next step: Run step1_ocr.ipynb\")"
306312
]
307313
}

notebooks/examples/step1_ocr.ipynb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
"outputs": [],
5656
"source": [
5757
"# Load document from previous step\n",
58-
"setup_data_dir = Path(\"data/step0_setup\")\n",
58+
"setup_data_dir = Path(\".data/step0_setup\")\n",
5959
"\n",
6060
"# Load document object from JSON\n",
6161
"document_path = setup_data_dir / \"document.json\"\n",
@@ -69,7 +69,7 @@
6969
"\n",
7070
"# Load each configuration file\n",
7171
"config_files = [\n",
72-
" \"ocr.yaml\"\n",
72+
" \"ocr.yaml\",\n",
7373
"]\n",
7474
"\n",
7575
"for config_file in config_files:\n",
@@ -193,7 +193,7 @@
193193
"outputs": [],
194194
"source": [
195195
"# Create data directory for this step\n",
196-
"data_dir = Path(\"data/step1_ocr\")\n",
196+
"data_dir = Path(\".data/step1_ocr\")\n",
197197
"data_dir.mkdir(parents=True, exist_ok=True)\n",
198198
"\n",
199199
"# Save updated document object as JSON\n",
@@ -234,7 +234,7 @@
234234
"print(f\"✅ Pages extracted: {document.num_pages}\")\n",
235235
"print(f\"✅ Processing time: {ocr_time:.2f} seconds\")\n",
236236
"print(f\"✅ Features used: {', '.join(features)}\")\n",
237-
"print(f\"✅ Data saved to: data/step1_ocr/\")\n",
237+
"print(f\"✅ Data saved to: .data/step1_ocr/\")\n",
238238
"print(\"\\n📌 Next step: Run step2_classification.ipynb\")"
239239
]
240240
}

notebooks/examples/step2_classification.ipynb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757
"outputs": [],
5858
"source": [
5959
"# Load document from previous step\n",
60-
"ocr_data_dir = Path(\"data/step1_ocr\")\n",
60+
"ocr_data_dir = Path(\".data/step1_ocr\")\n",
6161
"\n",
6262
"# Load document object from JSON\n",
6363
"document_path = ocr_data_dir / \"document.json\"\n",
@@ -222,7 +222,7 @@
222222
"outputs": [],
223223
"source": [
224224
"# Create data directory for this step\n",
225-
"data_dir = Path(\"data/step2_classification\")\n",
225+
"data_dir = Path(\".data/step2_classification\")\n",
226226
"data_dir.mkdir(parents=True, exist_ok=True)\n",
227227
"\n",
228228
"# Save updated document object as JSON\n",
@@ -288,7 +288,7 @@
288288
"print(f\"✅ Processing time: {classification_time:.2f} seconds\")\n",
289289
"print(f\"✅ Method used: {classification_config.get('classificationMethod')}\")\n",
290290
"print(f\"✅ Model used: {classification_config.get('model')}\")\n",
291-
"print(f\"✅ Data saved to: data/step2_classification/\")\n",
291+
"print(f\"✅ Data saved to: .data/step2_classification/\")\n",
292292
"print(\"\\n📌 Next step: Run step3_extraction.ipynb\")"
293293
]
294294
}

notebooks/examples/step3_extraction.ipynb

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757
"outputs": [],
5858
"source": [
5959
"# Load document from previous step\n",
60-
"classification_data_dir = Path(\"data/step2_classification\")\n",
60+
"classification_data_dir = Path(\".data/step2_classification\")\n",
6161
"\n",
6262
"# Load document object from JSON\n",
6363
"document_path = classification_data_dir / \"document.json\"\n",
@@ -120,11 +120,15 @@
120120
"print(f\"Temperature: {extraction_config.get('temperature')}\")\n",
121121
"print(f\"Max Tokens: {extraction_config.get('max_tokens')}\")\n",
122122
"print(\"*\"*50)\n",
123+
"\n",
123124
"print(f\"System Prompt:\\n{extraction_config.get('system_prompt')}\")\n",
124125
"print(\"*\"*50)\n",
125126
"print(f\"Task Prompt:\\n{extraction_config.get('task_prompt')}\")\n",
126127
"print(\"*\"*50)\n",
127128
"\n",
129+
"\n",
130+
"\n",
131+
"\n",
128132
"# Display available document classes and their attributes\n",
129133
"classes = CONFIG.get('classes', [])\n",
130134
"print(f\"\\nDocument Classes and Attributes:\")\n",
@@ -255,7 +259,7 @@
255259
" for attr_name, attr_value in inference_result.items():\n",
256260
" if attr_value is not None:\n",
257261
" # Truncate long values for display\n",
258-
" display_value = str(attr_value)[:100] + \"...\" if len(str(attr_value)) > 100 else attr_value\n",
262+
" display_value = str(attr_value)[:1000] + \"...\" if len(str(attr_value)) > 1000 else attr_value\n",
259263
" print(f\" {attr_name}: {display_value}\")\n",
260264
" else:\n",
261265
" print(f\" {attr_name}: null\")\n",
@@ -289,7 +293,7 @@
289293
"outputs": [],
290294
"source": [
291295
"# Create data directory for this step\n",
292-
"data_dir = Path(\"data/step3_extraction\")\n",
296+
"data_dir = Path(\".data/step3_extraction\")\n",
293297
"data_dir.mkdir(parents=True, exist_ok=True)\n",
294298
"\n",
295299
"# Save updated document object as JSON\n",
@@ -354,7 +358,7 @@
354358
"print(f\"✅ Sections processed: {sections_processed} of {len(document.sections) if document.sections else 0}\")\n",
355359
"print(f\"✅ Sections with results: {sections_with_results}\")\n",
356360
"print(f\"✅ Model used: {extraction_config.get('model')}\")\n",
357-
"print(f\"✅ Data saved to: data/step3_extraction/\")\n",
361+
"print(f\"✅ Data saved to: .data/step3_extraction/\")\n",
358362
"print(\"\\n📌 Next step: Run step4_assessment.ipynb\")"
359363
]
360364
}

0 commit comments

Comments (0)