Add benchmark utilities for baseline data preparation

Bob Strahan · Bob Strahan · commit 53bb75e11b0c · 2025-11-14T13:32:06.000Z
diff --git a/lib/idp_common_pkg/idp_common/evaluation/service.py b/lib/idp_common_pkg/idp_common/evaluation/service.py
@@ -378,17 +378,95 @@ def _get_stickler_model(
         schema = stickler_config["schema"]
         model_name = stickler_config["model_name"]
 
-        logger.info(f"Creating Stickler model for class: {document_class}")
-
-        # Use JsonSchemaFieldConverter to handle the full JSON Schema natively
-        from stickler.structured_object_evaluator.models.json_schema_field_converter import (
-            JsonSchemaFieldConverter,
+        # Enhanced logging: Log schema details before creating model
+        logger.info(
+            f"Creating Stickler model for class: {document_class}\n"
+            f"  Schema summary:\n"
+            f"    - Properties: {list(schema.get('properties', {}).keys())}\n"
+            f"    - Required fields: {schema.get('required', [])}\n"
+            f"    - Schema ID: {schema.get('$id', 'N/A')}\n"
+            f"    - Model name: {model_name}"
         )
 
-        converter = JsonSchemaFieldConverter(schema)
-        field_definitions = converter.convert_properties_to_fields(
-            schema.get("properties", {}), schema.get("required", [])
-        )
+        # Log expected and actual data structure for troubleshooting
+        if expected_data:
+            logger.info(
+                f"  Expected data keys for {document_class}: {list(expected_data.keys())}"
+            )
+
+        try:
+            # Use JsonSchemaFieldConverter to handle the full JSON Schema natively
+            from stickler.structured_object_evaluator.models.json_schema_field_converter import (
+                JsonSchemaFieldConverter,
+            )
+
+            logger.debug(f"Converting schema properties for {document_class}")
+
+            converter = JsonSchemaFieldConverter(schema)
+            field_definitions = converter.convert_properties_to_fields(
+                schema.get("properties", {}), schema.get("required", [])
+            )
+
+            logger.info(
+                f"Successfully converted schema for {document_class} with {len(field_definitions)} fields"
+            )
+
+        except Exception as e:
+            # Enhanced error handling with user guidance
+            import json
+            import re
+
+            error_message = str(e)
+
+            # Check if it's a JSON Schema validation error
+            if (
+                "jsonschema.exceptions.SchemaError" in str(type(e))
+                or "Invalid JSON Schema" in error_message
+            ):
+                # Try to extract the problematic field from the error
+                field_match = re.search(
+                    r"On schema\['properties'\]\['([^']+)'\]", error_message
+                )
+                field_name = field_match.group(1) if field_match else "unknown"
+
+                # Parse for constraint information
+                constraint_match = re.search(
+                    r"\['([^']+)'\]\s*:\s*'([^']+)'", error_message
+                )
+                constraint = (
+                    constraint_match.group(1) if constraint_match else "unknown"
+                )
+                bad_value = constraint_match.group(2) if constraint_match else "unknown"
+
+                # Build helpful error message
+                helpful_message = (
+                    f"Invalid JSON Schema for document class '{document_class}'.\n\n"
+                    f"Problem detected:\n"
+                    f"  Field: {field_name}\n"
+                    f"  Constraint: {constraint}\n"
+                    f"  Current value: '{bad_value}' (type: {type(bad_value).__name__})\n\n"
+                    f"Common fixes:\n"
+                    f"  1. If '{constraint}' should be a number, remove quotes in your config:\n"
+                    f"     {constraint}: '{bad_value}' → {constraint}: {bad_value}\n"
+                    f"  2. Check your config YAML for field '{field_name}' in class '{document_class}'\n"
+                    f"  3. Ensure all numeric constraints (maxItems, minItems, minimum, maximum, etc.) are numbers, not strings\n\n"
+                    f"Original error: {error_message}"
+                )
+
+                logger.error(helpful_message)
+                logger.error(
+                    f"Full schema that caused the error:\n{json.dumps(schema, indent=2, default=str)}"
+                )
+                raise ValueError(helpful_message) from e
+            else:
+                # Re-raise other errors with schema details
+                logger.error(
+                    f"Unexpected error creating Stickler model for {document_class}: {error_message}"
+                )
+                logger.error(
+                    f"Schema being processed:\n{json.dumps(schema, indent=2, default=str)}"
+                )
+                raise
 
         # Create the model using Pydantic's create_model
         from pydantic import create_model
diff --git a/lib/idp_common_pkg/idp_common/evaluation/stickler_mapper.py b/lib/idp_common_pkg/idp_common/evaluation/stickler_mapper.py
@@ -294,6 +294,83 @@ def _coerce_to_float(cls, value: Any, field_name: str = "") -> float:
             f"Field '{field_name}': Expected numeric value, got {type(value).__name__}"
         )
 
+    @classmethod
+    def _coerce_json_schema_types(
+        cls, schema: Dict[str, Any], field_path: str = ""
+    ) -> None:
+        """
+        Coerce string-valued numeric constraints to real numbers, in place.
+
+        Config files often quote numbers (e.g. maxItems: '7'), which breaks JSON
+        Schema validation. Size keywords are converted with int(); value keywords,
+        including multipleOf (may be fractional, e.g. '0.01'), with float(). A
+        string that cannot be converted is left unchanged and logged as an error.
+
+        Args:
+            schema: Schema to coerce (modified in-place); non-dicts are ignored
+            field_path: Current path, used only in log messages
+        """
+        if not isinstance(schema, dict):
+            return
+
+        # Size/count keywords: must be integers
+        INTEGER_CONSTRAINTS = (
+            "maxItems",
+            "minItems",
+            "maxLength",
+            "minLength",
+            "maxProperties",
+            "minProperties",
+        )
+
+        # Value keywords: any number. multipleOf belongs here because a
+        # fractional step such as 0.01 is valid and int("0.01") raises.
+        NUMBER_CONSTRAINTS = (
+            "minimum",
+            "maximum",
+            "exclusiveMinimum",
+            "exclusiveMaximum",
+            "multipleOf",
+        )
+
+        for key, value in list(schema.items()):
+            if not isinstance(value, str):
+                continue
+            if key in INTEGER_CONSTRAINTS:
+                convert, coerced_label, target_label = int, "integer", "integer"
+            elif key in NUMBER_CONSTRAINTS:
+                convert, coerced_label, target_label = float, "float", "number"
+            else:
+                continue
+            try:
+                schema[key] = convert(value)
+            except ValueError:
+                logger.error(
+                    f"Field '{field_path}': Cannot coerce {key}='{value}' to {target_label}. "
+                    f"This will cause validation errors."
+                )
+            else:
+                logger.info(
+                    f"Field '{field_path}': Coerced {key} from string '{value}' to {coerced_label} {schema[key]}"
+                )
+
+        # Recurse into nested schemas. Containers that are not mappings (e.g.
+        # an empty "properties:" key loaded from YAML as None) are skipped.
+        properties = schema.get(SCHEMA_PROPERTIES)
+        if isinstance(properties, dict):
+            for prop_name, prop_schema in properties.items():
+                prop_path = f"{field_path}.{prop_name}" if field_path else prop_name
+                cls._coerce_json_schema_types(prop_schema, prop_path)
+
+        if SCHEMA_ITEMS in schema:
+            items_path = f"{field_path}[]" if field_path else "items"
+            cls._coerce_json_schema_types(schema[SCHEMA_ITEMS], items_path)
+
+        definitions = schema.get("$defs")
+        if isinstance(definitions, dict):
+            for def_name, def_schema in definitions.items():
+                cls._coerce_json_schema_types(def_schema, f"$defs.{def_name}")
+
     @classmethod
     def _translate_extensions_in_schema(
         cls, schema: Dict[str, Any], field_path: str = ""
@@ -323,6 +400,9 @@ def _translate_extensions_in_schema(
         if not isinstance(schema, dict):
             return schema
 
+        # Coerce types FIRST, before any other processing
+        cls._coerce_json_schema_types(schema, field_path)
+
         # If this is an object with properties but no required array, add empty one
         # This makes all fields optional, allowing None values
         if schema.get(SCHEMA_TYPE) == TYPE_OBJECT and SCHEMA_PROPERTIES in schema:
diff --git a/scripts/benchmark_utils/README.md b/scripts/benchmark_utils/README.md
@@ -0,0 +1,195 @@
+# Benchmark Utilities
+
+This directory contains utility scripts for working with benchmark and evaluation datasets.
+
+## prep_baseline_data.py
+
+Convert ground truth data from JSONL format to IDP Accelerator evaluation baseline format.
+
+### Purpose
+
+This script processes JSONL files containing document ground truth labels and converts them into the directory structure required by the IDP Accelerator's evaluation framework.
+
+### Input Format
+
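+JSONL file where each line is a single JSON object with these fields (pretty-printed below for readability; `labels` is a JSON-encoded string, not a nested object):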
+```json
+{
+  "document_path": "path/to/document.pdf",
+  "labels": "{\"field1\": \"value1\", \"field2\": \"value2\", ...}"
+}
+```
+
+### Output Format
+
+Creates the following directory structure:
+```
+<output_base_path>/
+├── document1.pdf/
+│   └── sections/
+│       └── 1/
+│           └── result.json
+├── document2.pdf/
+│   └── sections/
+│       └── 1/
+│           └── result.json
+...
+```
+
+Where each `result.json` contains:
+```json
+{
+  "inference_result": {
+    "field1": "value1",
+    "field2": "value2",
+    ...
+  }
+}
+```
+
+### Usage
+
+#### Basic Usage (Default Paths)
+```bash
+python prep_baseline_data.py
+```
+
+Default paths:
+- **Input**: `scratch/fcc_invoices_reann_standardized_val_fixed_v0.jsonl`
+- **Output**: `scratch/accelerator/fcc_invoices/evaluation_baseline/`
+
+#### Dry Run (Preview Only)
+```bash
+python prep_baseline_data.py --dry-run
+```
+
+#### Custom Paths
+```bash
+python prep_baseline_data.py \
+  --input path/to/your/ground_truth.jsonl \
+  --output path/to/output/baseline/
+```
+
+#### Overwrite Existing Files
+```bash
+python prep_baseline_data.py --overwrite
+```
+
+#### Skip Validation
+```bash
+python prep_baseline_data.py --no-validate
+```
+
+### Command-Line Options
+
+| Option | Description | Default |
+|--------|-------------|---------|
+| `--input PATH` | Path to input JSONL file | `scratch/fcc_invoices_reann_standardized_val_fixed_v0.jsonl` |
+| `--output PATH` | Base path for output baseline files | `scratch/accelerator/fcc_invoices/evaluation_baseline` |
+| `--dry-run` | Simulate processing without creating files | False |
+| `--overwrite` | Overwrite existing baseline files | False |
+| `--validate` | Validate created files after processing | True |
+| `--no-validate` | Skip validation of created files | - |
+
+### Features
+
+- **Error Handling**: Gracefully handles malformed JSON, missing fields, and file system errors
+- **Duplicate Detection**: Warns about duplicate document IDs in the input file
+- **Progress Tracking**: Shows progress every 100 documents processed
+- **Validation**: Automatically validates a sample of created files
+- **Statistics**: Provides detailed summary of processing results
+- **Dry Run Mode**: Preview what would be created without writing files
+
+### Output Summary
+
+After processing, the script displays a summary including:
+- Total documents processed
+- Successfully created files
+- Skipped files (if not overwriting)
+- Failed operations
+- Duplicate document IDs
+- Error details
+- Success rate
+
+Example output:
+```
+================================================================================
+PROCESSING SUMMARY
+================================================================================
+Total documents in file:     150
+Successfully processed:      148
+Skipped (already exist):     0
+Failed:                      2
+Unique doc_ids:              148
+
+Success rate: 98.7%
+================================================================================
+```
+
+### Error Handling
+
+The script handles various error scenarios:
+- **Missing input file**: Exits with clear error message
+- **Malformed JSON**: Logs line number and continues processing
+- **Missing required fields**: Logs error and skips document
+- **File system errors**: Logs error and continues with remaining documents
+- **Duplicate document IDs**: Warns but continues processing
+
+### Exit Codes
+
+- `0`: Success (all documents processed without errors)
+- `1`: Failure (fatal error or some documents failed)
+
+### Examples
+
+#### Process with default paths and see detailed output
+```bash
+python prep_baseline_data.py
+```
+
+#### Test the script without creating files
+```bash
+python prep_baseline_data.py --dry-run
+```
+
+#### Process a different dataset
+```bash
+python prep_baseline_data.py \
+  --input data/invoice_labels.jsonl \
+  --output baseline/invoices/
+```
+
+#### Force overwrite of existing baseline files
+```bash
+python prep_baseline_data.py --overwrite
+```
+
+### Integration with IDP Accelerator
+
+Once baseline files are created, use them with the IDP Accelerator evaluation framework:
+
+1. Upload the baseline directory to your evaluation S3 bucket
+2. Configure the evaluation framework to use this baseline
+3. Process documents through the IDP pipeline
+4. View evaluation reports comparing results to baseline
+
+See `docs/evaluation.md` for more details on the evaluation framework.
+
+### Troubleshooting
+
+**Problem**: Script fails with "Input file not found"
+- **Solution**: Verify the input file path is correct
+
+**Problem**: Permission denied when creating files
+- **Solution**: Ensure you have write permissions to the output directory
+
+**Problem**: Out of memory errors
+- **Solution**: The script processes line-by-line and should handle large files. If issues persist, split the input file into smaller chunks.
+
+**Problem**: Validation fails
+- **Solution**: Check the error messages for specific files, then inspect the result.json files manually
+
+### License
+
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+SPDX-License-Identifier: MIT-0
diff --git a/scripts/benchmark_utils/prep_baseline_data.py b/scripts/benchmark_utils/prep_baseline_data.py