Commit 53bb75e

Author: Bob Strahan (committed)
Add benchmark utilities for baseline data preparation
1 parent 0f9b8cb commit 53bb75e

File tree: 4 files changed, +830 −9 lines changed

lib/idp_common_pkg/idp_common/evaluation/service.py

Lines changed: 87 additions & 9 deletions
```diff
@@ -378,17 +378,95 @@ def _get_stickler_model(
         schema = stickler_config["schema"]
         model_name = stickler_config["model_name"]
 
-        logger.info(f"Creating Stickler model for class: {document_class}")
-
-        # Use JsonSchemaFieldConverter to handle the full JSON Schema natively
-        from stickler.structured_object_evaluator.models.json_schema_field_converter import (
-            JsonSchemaFieldConverter,
+        # Enhanced logging: Log schema details before creating model
+        logger.info(
+            f"Creating Stickler model for class: {document_class}\n"
+            f"  Schema summary:\n"
+            f"  - Properties: {list(schema.get('properties', {}).keys())}\n"
+            f"  - Required fields: {schema.get('required', [])}\n"
+            f"  - Schema ID: {schema.get('$id', 'N/A')}\n"
+            f"  - Model name: {model_name}"
         )
 
-        converter = JsonSchemaFieldConverter(schema)
-        field_definitions = converter.convert_properties_to_fields(
-            schema.get("properties", {}), schema.get("required", [])
-        )
+        # Log expected and actual data structure for troubleshooting
+        if expected_data:
+            logger.info(
+                f"  Expected data keys for {document_class}: {list(expected_data.keys())}"
+            )
+
+        try:
+            # Use JsonSchemaFieldConverter to handle the full JSON Schema natively
+            from stickler.structured_object_evaluator.models.json_schema_field_converter import (
+                JsonSchemaFieldConverter,
+            )
+
+            logger.debug(f"Converting schema properties for {document_class}")
+
+            converter = JsonSchemaFieldConverter(schema)
+            field_definitions = converter.convert_properties_to_fields(
+                schema.get("properties", {}), schema.get("required", [])
+            )
+
+            logger.info(
+                f"Successfully converted schema for {document_class} with {len(field_definitions)} fields"
+            )
+
+        except Exception as e:
+            # Enhanced error handling with user guidance
+            import json
+            import re
+
+            error_message = str(e)
+
+            # Check if it's a JSON Schema validation error
+            if (
+                "jsonschema.exceptions.SchemaError" in str(type(e))
+                or "Invalid JSON Schema" in error_message
+            ):
+                # Try to extract the problematic field from the error
+                field_match = re.search(
+                    r"On schema\['properties'\]\['([^']+)'\]", error_message
+                )
+                field_name = field_match.group(1) if field_match else "unknown"
+
+                # Parse for constraint information
+                constraint_match = re.search(
+                    r"\['([^']+)'\]\s*:\s*'([^']+)'", error_message
+                )
+                constraint = (
+                    constraint_match.group(1) if constraint_match else "unknown"
+                )
+                bad_value = constraint_match.group(2) if constraint_match else "unknown"
+
+                # Build helpful error message
+                helpful_message = (
+                    f"Invalid JSON Schema for document class '{document_class}'.\n\n"
+                    f"Problem detected:\n"
+                    f"  Field: {field_name}\n"
+                    f"  Constraint: {constraint}\n"
+                    f"  Current value: '{bad_value}' (type: {type(bad_value).__name__})\n\n"
+                    f"Common fixes:\n"
+                    f"  1. If '{constraint}' should be a number, remove quotes in your config:\n"
+                    f"     {constraint}: '{bad_value}' → {constraint}: {bad_value}\n"
+                    f"  2. Check your config YAML for field '{field_name}' in class '{document_class}'\n"
+                    f"  3. Ensure all numeric constraints (maxItems, minItems, minimum, maximum, etc.) are numbers, not strings\n\n"
+                    f"Original error: {error_message}"
+                )
+
+                logger.error(helpful_message)
+                logger.error(
+                    f"Full schema that caused the error:\n{json.dumps(schema, indent=2, default=str)}"
+                )
+                raise ValueError(helpful_message) from e
+            else:
+                # Re-raise other errors with schema details
+                logger.error(
+                    f"Unexpected error creating Stickler model for {document_class}: {error_message}"
+                )
+                logger.error(
+                    f"Schema being processed:\n{json.dumps(schema, indent=2, default=str)}"
+                )
+                raise
 
         # Create the model using Pydantic's create_model
         from pydantic import create_model
```

lib/idp_common_pkg/idp_common/evaluation/stickler_mapper.py

Lines changed: 80 additions & 0 deletions
```diff
@@ -294,6 +294,83 @@ def _coerce_to_float(cls, value: Any, field_name: str = "") -> float:
             f"Field '{field_name}': Expected numeric value, got {type(value).__name__}"
         )
 
+    @classmethod
+    def _coerce_json_schema_types(
+        cls, schema: Dict[str, Any], field_path: str = ""
+    ) -> None:
+        """
+        Coerce string values to proper JSON Schema types.
+
+        This fixes common issues where numeric constraints are provided as strings
+        instead of numbers (e.g., maxItems: '7' should be maxItems: 7).
+
+        Args:
+            schema: Schema to coerce (modified in-place)
+            field_path: Current path for error messages
+        """
+        if not isinstance(schema, dict):
+            return
+
+        # Numeric constraints that must be integers
+        INTEGER_CONSTRAINTS = [
+            "maxItems",
+            "minItems",
+            "maxLength",
+            "minLength",
+            "maxProperties",
+            "minProperties",
+            "multipleOf",
+        ]
+
+        # Numeric constraints that must be numbers (int or float)
+        NUMBER_CONSTRAINTS = [
+            "minimum",
+            "maximum",
+            "exclusiveMinimum",
+            "exclusiveMaximum",
+        ]
+
+        for key, value in list(schema.items()):
+            # Coerce integer constraints
+            if key in INTEGER_CONSTRAINTS and isinstance(value, str):
+                try:
+                    schema[key] = int(value)
+                    logger.info(
+                        f"Field '{field_path}': Coerced {key} from string '{value}' to integer {schema[key]}"
+                    )
+                except ValueError:
+                    logger.error(
+                        f"Field '{field_path}': Cannot coerce {key}='{value}' to integer. "
+                        f"This will cause validation errors."
+                    )
+
+            # Coerce number constraints
+            elif key in NUMBER_CONSTRAINTS and isinstance(value, str):
+                try:
+                    schema[key] = float(value)
+                    logger.info(
+                        f"Field '{field_path}': Coerced {key} from string '{value}' to float {schema[key]}"
+                    )
+                except ValueError:
+                    logger.error(
+                        f"Field '{field_path}': Cannot coerce {key}='{value}' to number. "
+                        f"This will cause validation errors."
+                    )
+
+        # Recursively process nested schemas
+        if SCHEMA_PROPERTIES in schema:
+            for prop_name, prop_schema in schema[SCHEMA_PROPERTIES].items():
+                prop_path = f"{field_path}.{prop_name}" if field_path else prop_name
+                cls._coerce_json_schema_types(prop_schema, prop_path)
+
+        if SCHEMA_ITEMS in schema:
+            items_path = f"{field_path}[]" if field_path else "items"
+            cls._coerce_json_schema_types(schema[SCHEMA_ITEMS], items_path)
+
+        if "$defs" in schema:
+            for def_name, def_schema in schema["$defs"].items():
+                cls._coerce_json_schema_types(def_schema, f"$defs.{def_name}")
+
     @classmethod
     def _translate_extensions_in_schema(
         cls, schema: Dict[str, Any], field_path: str = ""
@@ -323,6 +400,9 @@ def _translate_extensions_in_schema(
         if not isinstance(schema, dict):
             return schema
 
+        # Coerce types FIRST, before any other processing
+        cls._coerce_json_schema_types(schema, field_path)
+
         # If this is an object with properties but no required array, add empty one
         # This makes all fields optional, allowing None values
         if schema.get(SCHEMA_TYPE) == TYPE_OBJECT and SCHEMA_PROPERTIES in schema:
```
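For reference, here is a minimal standalone sketch of the coercion behavior added above. The function name and sample schema are illustrative only; the committed classmethod additionally recurses into `$defs` and logs every coercion and failure.

```python
# Illustrative sketch only -- mirrors the coercion logic added in stickler_mapper.py,
# minus logging and the $defs recursion. All names here are hypothetical.
from typing import Any, Dict

INTEGER_CONSTRAINTS = {"maxItems", "minItems", "maxLength", "minLength",
                       "maxProperties", "minProperties", "multipleOf"}
NUMBER_CONSTRAINTS = {"minimum", "maximum", "exclusiveMinimum", "exclusiveMaximum"}


def coerce_schema_types(schema: Dict[str, Any]) -> None:
    """Convert string-valued numeric constraints to numbers, in place."""
    if not isinstance(schema, dict):
        return
    for key, value in list(schema.items()):
        if key in INTEGER_CONSTRAINTS and isinstance(value, str):
            schema[key] = int(value)      # e.g. maxItems: "7" -> 7
        elif key in NUMBER_CONSTRAINTS and isinstance(value, str):
            schema[key] = float(value)    # e.g. minimum: "0" -> 0.0
    for prop in schema.get("properties", {}).values():
        coerce_schema_types(prop)
    if isinstance(schema.get("items"), dict):
        coerce_schema_types(schema["items"])


sample = {
    "type": "object",
    "properties": {
        "line_items": {"type": "array", "maxItems": "7"},
        "total": {"type": "number", "minimum": "0"},
    },
}
coerce_schema_types(sample)
print(sample["properties"]["line_items"]["maxItems"])  # 7 (int, no longer a string)
print(sample["properties"]["total"]["minimum"])        # 0.0
```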

scripts/benchmark_utils/README.md

Lines changed: 195 additions & 0 deletions
# Benchmark Utilities

This directory contains utility scripts for working with benchmark and evaluation datasets.

## prep_baseline_data.py

Convert ground truth data from JSONL format to IDP Accelerator evaluation baseline format.

### Purpose

This script processes JSONL files containing document ground truth labels and converts them into the directory structure required by the IDP Accelerator's evaluation framework.

### Input Format

JSONL file where each line contains:
```json
{
  "document_path": "path/to/document.pdf",
  "labels": "{\"field1\": \"value1\", \"field2\": \"value2\", ...}"
}
```

### Output Format

Creates the following directory structure:
```
<output_base_path>/
├── document1.pdf/
│   └── sections/
│       └── 1/
│           └── result.json
├── document2.pdf/
│   └── sections/
│       └── 1/
│           └── result.json
...
```

Where each `result.json` contains:
```json
{
  "inference_result": {
    "field1": "value1",
    "field2": "value2",
    ...
  }
}
```
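As an illustration of this mapping, here is a minimal sketch of how one JSONL record could be turned into the layout above. It assumes the document ID is the basename of `document_path`; see `prep_baseline_data.py` for the exact behavior.

```python
# Minimal sketch of the JSONL -> baseline conversion described above.
# Assumption: the output directory name is the basename of "document_path".
import json
from pathlib import Path


def convert_record(line: str, output_base: Path) -> Path:
    record = json.loads(line)
    doc_id = Path(record["document_path"]).name     # e.g. "document1.pdf"
    labels = json.loads(record["labels"])           # "labels" is a JSON-encoded string
    result_path = output_base / doc_id / "sections" / "1" / "result.json"
    result_path.parent.mkdir(parents=True, exist_ok=True)
    result_path.write_text(json.dumps({"inference_result": labels}, indent=2))
    return result_path


# Usage:
# with open("ground_truth.jsonl") as f:
#     for line in f:
#         convert_record(line, Path("evaluation_baseline"))
```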
### Usage

#### Basic Usage (Default Paths)
```bash
python prep_baseline_data.py
```

Default paths:
- **Input**: `scratch/fcc_invoices_reann_standardized_val_fixed_v0.jsonl`
- **Output**: `scratch/accelerator/fcc_invoices/evaluation_baseline/`

#### Dry Run (Preview Only)
```bash
python prep_baseline_data.py --dry-run
```

#### Custom Paths
```bash
python prep_baseline_data.py \
  --input path/to/your/ground_truth.jsonl \
  --output path/to/output/baseline/
```

#### Overwrite Existing Files
```bash
python prep_baseline_data.py --overwrite
```

#### Skip Validation
```bash
python prep_baseline_data.py --no-validate
```

### Command-Line Options

| Option | Description | Default |
|--------|-------------|---------|
| `--input PATH` | Path to input JSONL file | `scratch/fcc_invoices_reann_standardized_val_fixed_v0.jsonl` |
| `--output PATH` | Base path for output baseline files | `scratch/accelerator/fcc_invoices/evaluation_baseline` |
| `--dry-run` | Simulate processing without creating files | False |
| `--overwrite` | Overwrite existing baseline files | False |
| `--validate` | Validate created files after processing | True |
| `--no-validate` | Skip validation of created files | - |

### Features

- **Error Handling**: Gracefully handles malformed JSON, missing fields, and file system errors
- **Duplicate Detection**: Warns about duplicate document IDs in the input file
- **Progress Tracking**: Shows progress every 100 documents processed
- **Validation**: Automatically validates a sample of created files
- **Statistics**: Provides detailed summary of processing results
- **Dry Run Mode**: Preview what would be created without writing files

### Output Summary

After processing, the script displays a summary including:
- Total documents processed
- Successfully created files
- Skipped files (if not overwriting)
- Failed operations
- Duplicate document IDs
- Error details
- Success rate

Example output:
```
================================================================================
PROCESSING SUMMARY
================================================================================
Total documents in file: 150
Successfully processed: 148
Skipped (already exist): 0
Failed: 2
Unique doc_ids: 148

Success rate: 98.7%
================================================================================
```

### Error Handling

The script handles various error scenarios (sketched below):
- **Missing input file**: Exits with clear error message
- **Malformed JSON**: Logs line number and continues processing
- **Missing required fields**: Logs error and skips document
- **File system errors**: Logs error and continues with remaining documents
- **Duplicate document IDs**: Warns but continues processing
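A rough sketch of the per-line handling pattern described above, for orientation only; this is not the script's actual implementation, and all names in it are hypothetical.

```python
# Illustrative only: per-line error handling, duplicate detection, and progress
# reporting in the style described above. Not the actual prep_baseline_data.py code.
import json


def scan_jsonl(path: str) -> tuple[int, int]:
    seen_ids: set[str] = set()
    failed = 0
    with open(path, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            try:
                record = json.loads(line)
                doc_id = record["document_path"]       # KeyError if the field is missing
            except (json.JSONDecodeError, KeyError) as err:
                print(f"line {line_no}: skipped ({err})")  # log and continue
                failed += 1
                continue
            if doc_id in seen_ids:
                print(f"line {line_no}: duplicate document ID {doc_id!r}")  # warn, keep going
            seen_ids.add(doc_id)
            if line_no % 100 == 0:
                print(f"processed {line_no} lines")
    return len(seen_ids), failed
```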
### Exit Codes

- `0`: Success (all documents processed without errors)
- `1`: Failure (fatal error or some documents failed)

### Examples

#### Process with default paths and see detailed output
```bash
python prep_baseline_data.py
```

#### Test the script without creating files
```bash
python prep_baseline_data.py --dry-run
```

#### Process a different dataset
```bash
python prep_baseline_data.py \
  --input data/invoice_labels.jsonl \
  --output baseline/invoices/
```

#### Force overwrite of existing baseline files
```bash
python prep_baseline_data.py --overwrite
```

### Integration with IDP Accelerator

Once baseline files are created, use them with the IDP Accelerator evaluation framework:

1. Upload the baseline directory to your evaluation S3 bucket (see the sketch after this list)
2. Configure the evaluation framework to use this baseline
3. Process documents through the IDP pipeline
4. View evaluation reports comparing results to baseline
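For step 1, a hypothetical upload sketch; the bucket name and prefix are placeholders, and `aws s3 sync <local> s3://<bucket>/<prefix>/` from the CLI achieves the same result.

```python
# Hypothetical sketch for uploading the baseline directory (step 1).
# The bucket and prefix below are placeholders, not values required by the accelerator.
from pathlib import Path

import boto3


def upload_baseline(local_base: Path, bucket: str, prefix: str) -> None:
    s3 = boto3.client("s3")
    for path in local_base.rglob("result.json"):
        key = f"{prefix}/{path.relative_to(local_base).as_posix()}"
        s3.upload_file(str(path), bucket, key)
        print(f"uploaded s3://{bucket}/{key}")


# upload_baseline(Path("scratch/accelerator/fcc_invoices/evaluation_baseline"),
#                 bucket="your-evaluation-bucket", prefix="evaluation_baseline")
```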
See `docs/evaluation.md` for more details on the evaluation framework.

### Troubleshooting

**Problem**: Script fails with "Input file not found"
- **Solution**: Verify the input file path is correct

**Problem**: Permission denied when creating files
- **Solution**: Ensure you have write permissions to the output directory

**Problem**: Out of memory errors
- **Solution**: The script processes line-by-line and should handle large files. If issues persist, split the input file into smaller chunks.

**Problem**: Validation fails
- **Solution**: Check the error messages for specific files, then inspect the `result.json` files manually

### License

Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: MIT-0
