Add evaluation step to all processing patterns with EVALUATING status and UI support

Bob Strahan · Bob Strahan · commit 66a57f23b540 · 2025-10-24T12:01:51.000Z
diff --git a/lib/idp_common_pkg/idp_common/models.py b/lib/idp_common_pkg/idp_common/models.py
@@ -27,6 +27,7 @@ class Status(Enum):
     POSTPROCESSING = "POSTPROCESSING"  # Document summarization
     HITL_IN_PROGRESS = "HITL_IN_PROGRESS"  # Human-in-the-loop review in progress
     SUMMARIZING = "SUMMARIZING"  # Document summarization
+    EVALUATING = "EVALUATING"  # Document evaluation
     COMPLETED = "COMPLETED"  # All processing completed
     FAILED = "FAILED"  # Processing failed
 
diff --git a/patterns/pattern-1/statemachine/workflow.asl.json b/patterns/pattern-1/statemachine/workflow.asl.json
@@ -167,6 +167,33 @@
                     "BackoffRate": 2
                 }
             ],
+            "Next": "EvaluationStep"
+        },
+        "EvaluationStep": {
+            "Type": "Task",
+            "Resource": "${EvaluationLambdaArn}",
+            "Parameters": {
+                "execution_arn.$": "$$.Execution.Id",
+                "document.$": "$.Result.document"
+            },
+            "ResultPath": "$.Result",
+            "Retry": [
+                {
+                    "ErrorEquals": [
+                        "Lambda.ServiceException",
+                        "Lambda.AWSLambdaException",
+                        "Lambda.SdkClientException",
+                        "Lambda.TooManyRequestsException",
+                        "ServiceQuotaExceededException",
+                        "ThrottlingException",
+                        "ProvisionedThroughputExceededException",
+                        "RequestLimitExceeded"
+                    ],
+                    "IntervalSeconds": 2,
+                    "MaxAttempts": 10,
+                    "BackoffRate": 2
+                }
+            ],
             "Next": "WorkflowComplete"
         },
         "WorkflowComplete": {
diff --git a/patterns/pattern-1/template.yaml b/patterns/pattern-1/template.yaml
@@ -94,6 +94,10 @@ Parameters:
   ConfigLibraryHash:
     Type: String
     Description: "Hash token from config library to force updates when config library changes"
+
+  EvaluationFunctionArn:
+    Type: String
+    Description: "ARN of the Evaluation Lambda function"
     
   EnableHITL:
     Type: String
@@ -362,6 +366,11 @@ Resources:
             type: object
             sectionLabel: Evaluation Inference
             properties:
+              enabled:
+                type: boolean
+                description: Enable or disable evaluation processing
+                default: true
+                order: 0
               llm_method:
                 type: object
                 properties:
@@ -648,6 +657,7 @@ Resources:
         HITLWaitFunctionArn: !GetAtt HITLWaitFunction.Arn
         HITLStatusUpdateFunctionArn: !GetAtt HITLStatusUpdateFunction.Arn
         SummarizationLambdaArn: !GetAtt SummarizationFunction.Arn
+        EvaluationLambdaArn: !Ref EvaluationFunctionArn
         EnableHITL: !Ref EnableHITL
         OutputBucket: !Ref OutputBucket
         WorkingBucket: !Ref WorkingBucket
@@ -670,6 +680,8 @@ Resources:
             FunctionName: !Ref HITLWaitFunction
         - LambdaInvokePolicy:
             FunctionName: !Ref HITLStatusUpdateFunction
+        - LambdaInvokePolicy:
+            FunctionName: !Ref EvaluationFunctionArn
         - CloudWatchLogsFullAccess
 
   StateMachineLogGroup:
diff --git a/patterns/pattern-2/statemachine/workflow.asl.json b/patterns/pattern-2/statemachine/workflow.asl.json
@@ -252,6 +252,34 @@
                     "BackoffRate": 2
                 }
             ],
+            "Next": "EvaluationStep"
+        },
+        "EvaluationStep": {
+            "Type": "Task",
+            "Resource": "${EvaluationLambdaArn}",
+            "Parameters": {
+                "execution_arn.$": "$$.Execution.Id",
+                "document.$": "$"
+            },
+            "ResultPath": "$",
+            "Retry": [
+                {
+                    "ErrorEquals": [
+                        "Sandbox.Timedout",
+                        "Lambda.ServiceException",
+                        "Lambda.AWSLambdaException",
+                        "Lambda.SdkClientException",
+                        "Lambda.TooManyRequestsException",
+                        "ServiceQuotaExceededException",
+                        "ThrottlingException",
+                        "ProvisionedThroughputExceededException",
+                        "RequestLimitExceeded"
+                    ],
+                    "IntervalSeconds": 2,
+                    "MaxAttempts": 10,
+                    "BackoffRate": 2
+                }
+            ],
             "Next": "WorkflowComplete"
         },
         "WorkflowComplete": {
diff --git a/patterns/pattern-2/template.yaml b/patterns/pattern-2/template.yaml
@@ -100,6 +100,10 @@ Parameters:
     Type: String
     Description: "Hash token from config library to force updates when config library changes"
 
+  EvaluationFunctionArn:
+    Type: String
+    Description: "ARN of the Evaluation Lambda function"
+
   EnableXRayTracing:
     Type: String
     Default: 'false'
@@ -1108,6 +1112,11 @@ Resources:
             type: object
             sectionLabel: Evaluation Inference
             properties:
+              enabled:
+                type: boolean
+                description: Enable or disable evaluation processing
+                default: true
+                order: 0
               llm_method:
                 type: object
                 properties:
@@ -2330,6 +2339,7 @@ Resources:
         HITLWaitFunctionArn: !GetAtt HITLWaitFunction.Arn
         HITLStatusUpdateFunctionArn: !GetAtt HITLStatusUpdateFunction.Arn
         SummarizationLambdaArn: !GetAtt SummarizationFunction.Arn
+        EvaluationLambdaArn: !Ref EvaluationFunctionArn
         OutputBucket: !Ref OutputBucket
       Logging:
         Level: ALL
@@ -2355,6 +2365,8 @@ Resources:
             FunctionName: !Ref HITLStatusUpdateFunction
         - LambdaInvokePolicy:
             FunctionName: !Ref SummarizationFunction
+        - LambdaInvokePolicy:
+            FunctionName: !Ref EvaluationFunctionArn
         - CloudWatchLogsFullAccess
 
   StateMachineLogGroup:
diff --git a/patterns/pattern-3/statemachine/workflow.asl.json b/patterns/pattern-3/statemachine/workflow.asl.json
@@ -187,6 +187,34 @@
                     "BackoffRate": 2
                 }
             ],
+            "Next": "EvaluationStep"
+        },
+        "EvaluationStep": {
+            "Type": "Task",
+            "Resource": "${EvaluationLambdaArn}",
+            "Parameters": {
+                "execution_arn.$": "$$.Execution.Id",
+                "document.$": "$"
+            },
+            "ResultPath": "$",
+            "Retry": [
+                {
+                    "ErrorEquals": [
+                        "Sandbox.Timedout",
+                        "Lambda.ServiceException",
+                        "Lambda.AWSLambdaException",
+                        "Lambda.SdkClientException",
+                        "Lambda.TooManyRequestsException",
+                        "ServiceQuotaExceededException",
+                        "ThrottlingException",
+                        "ProvisionedThroughputExceededException",
+                        "RequestLimitExceeded"
+                    ],
+                    "IntervalSeconds": 2,
+                    "MaxAttempts": 10,
+                    "BackoffRate": 2
+                }
+            ],
             "Next": "WorkflowComplete"
         },
         "WorkflowComplete": {
diff --git a/patterns/pattern-3/template.yaml b/patterns/pattern-3/template.yaml
@@ -92,6 +92,10 @@ Parameters:
     Type: String
     Description: "Hash token from config library to force updates when config library changes"
 
+  EvaluationFunctionArn:
+    Type: String
+    Description: "ARN of the Evaluation Lambda function"
+
   PermissionsBoundaryArn:
     Type: String
     Default: ""
@@ -700,6 +704,11 @@ Resources:
             type: object
             sectionLabel: Evaluation Inference
             properties:
+              enabled:
+                type: boolean
+                description: Enable or disable evaluation processing
+                default: true
+                order: 0
               llm_method:
                 type: object
                 properties:
@@ -1452,6 +1461,7 @@ Resources:
         AssessmentFunctionArn: !GetAtt AssessmentFunction.Arn
         ProcessResultsLambdaArn: !GetAtt ProcessResultsFunction.Arn
         SummarizationLambdaArn: !GetAtt SummarizationFunction.Arn
+        EvaluationLambdaArn: !Ref EvaluationFunctionArn
         OutputBucket: !Ref OutputBucket
       Logging:
         Level: ALL
@@ -1471,6 +1481,8 @@ Resources:
             FunctionName: !Ref ProcessResultsFunction
         - LambdaInvokePolicy:
             FunctionName: !Ref SummarizationFunction
+        - LambdaInvokePolicy:
+            FunctionName: !Ref EvaluationFunctionArn
         - CloudWatchLogsFullAccess
 
   StateMachineLogGroup:
diff --git a/src/lambda/evaluation_function/index.py b/src/lambda/evaluation_function/index.py
@@ -60,10 +60,10 @@ def update_document_evaluation_status(document: Document, status: EvaluationStat
 
 def extract_document_from_event(event: Dict[str, Any]) -> Optional[Document]:
     """
-    Extract document from Lambda event
+    Extract document from Lambda event (state machine format)
     
     Args:
-        event: Lambda event
+        event: Lambda event containing document data
         
     Returns:
         Document object or None if not found
@@ -72,17 +72,16 @@ def extract_document_from_event(event: Dict[str, Any]) -> Optional[Document]:
         ValueError: If document cannot be extracted from event
     """
     try:
-        output_data = json.loads(event['detail']['output'])
+        # State machine format: event['document'] contains the document data
+        document_data = event.get('document')
         
-        if not output_data:
-            raise ValueError("No output data found in event")
+        if not document_data:
+            raise ValueError("No document data found in event")
                        
-        # Get document from the final processing step
+        # Get document from state machine format
         working_bucket = os.environ.get('WORKING_BUCKET')
-        # look for document_data in either output_data.Result.document (Pattern-1) or output_data (others)
-        document_data = output_data.get('Result',{}).get('document', output_data)
         document = Document.load_document(document_data, working_bucket, logger)
-        logger.info(f"Successfully loaded actual document with {len(document.pages)} pages and {len(document.sections)} sections")
+        logger.info(f"Successfully loaded document with {len(document.pages)} pages and {len(document.sections)} sections")
         return document
     except Exception as e:
         logger.error(f"Error extracting document from event: {str(e)}")
@@ -154,34 +153,43 @@ def handler(event, context):
         context: Lambda context
         
     Returns:
-        Response with evaluation results
+        Document in state machine format: {'document': document.serialize_document()}
     """
     actual_document = None
     start_time = time.time()
     
     try:
-        logger.info(f"Starting evaluation process with event: {json.dumps(event, indent=2)}")
+        logger.info(f"Starting evaluation process")
         
         # Extract document from event
         actual_document = extract_document_from_event(event)
         
-        # Update document status to RUNNING
+        # Load configuration and check if evaluation is enabled
+        config = get_config()
+        evaluation_enabled = config.get('evaluation', {}).get('enabled', True)
+        
+        if not evaluation_enabled:
+            logger.info("Evaluation is disabled in configuration, skipping evaluation")
+            # Return document unchanged
+            return {'document': actual_document.serialize_document()}
+        
+        # Set document status to EVALUATING before processing
+        actual_document.status = Status.EVALUATING
+        document_service.update_document(actual_document)
+        
+        # Update document evaluation status to RUNNING
         update_document_evaluation_status(actual_document, EvaluationStatus.RUNNING)
         
         # Load baseline document
         expected_document = load_baseline_document(actual_document.input_key)
         
         # If no baseline document is found, update status and exit
         if not expected_document:
-            update_document_evaluation_status(actual_document, EvaluationStatus.NO_BASELINE)
-            return create_response(
-                200,
-                'Evaluation skipped - no baseline data available',
-                {'document_key': actual_document.input_key}
-            )
+            actual_document = update_document_evaluation_status(actual_document, EvaluationStatus.NO_BASELINE)
+            logger.info("Evaluation skipped - no baseline data available")
+            return {'document': actual_document.serialize_document()}
         
-        # Load configuration and create evaluation service
-        config = get_config()
+        # Create evaluation service
         evaluation_service = evaluation.EvaluationService(config=config)
         
         # Run evaluation
@@ -196,8 +204,8 @@ def handler(event, context):
         if evaluated_document.errors:
             error_msg = f"Evaluation encountered errors: {evaluated_document.errors}"
             logger.error(error_msg)
-            update_document_evaluation_status(evaluated_document, EvaluationStatus.FAILED)
-            return create_response(500, 'Evaluation failed', {'error': error_msg})
+            evaluated_document = update_document_evaluation_status(evaluated_document, EvaluationStatus.FAILED)
+            return {'document': evaluated_document.serialize_document()}
        
         # Save evaluation results to reporting bucket for analytics using the SaveReportingData Lambda
         try:
@@ -224,18 +232,11 @@ def handler(event, context):
             # Continue execution - don't fail the entire function if reporting fails
         
         # Update document evaluation status to COMPLETED
-        update_document_evaluation_status(evaluated_document, EvaluationStatus.COMPLETED)
-        logger.info("Evaluation process completed successfully")
-        
-        # Return success response
-        return create_response(
-            200,
-            'Evaluation completed successfully',
-            {
-                'report_location': evaluated_document.evaluation_report_uri,
-                'execution_time': time.time() - start_time
-            }
-        )
+        evaluated_document = update_document_evaluation_status(evaluated_document, EvaluationStatus.COMPLETED)
+        logger.info(f"Evaluation process completed successfully in {time.time() - start_time:.2f} seconds")
+        
+        # Return document in state machine format
+        return {'document': evaluated_document.serialize_document()}
     
     except Exception as e:
         error_msg = f"Error in lambda_handler: {str(e)}"
@@ -244,8 +245,10 @@ def handler(event, context):
         # Update document status to FAILED if we have the document
         if actual_document:
             try:
-                update_document_evaluation_status(actual_document, EvaluationStatus.FAILED)
+                actual_document = update_document_evaluation_status(actual_document, EvaluationStatus.FAILED)
+                return {'document': actual_document.serialize_document()}
             except Exception as update_error:
                 logger.error(f"Failed to update evaluation status: {str(update_error)}")
         
-        return create_response(500, 'Evaluation failed', {'error': error_msg})
+        # No document could be returned (extraction failed or the status update above raised): re-raise so Step Functions handles the error
+        raise
diff --git a/src/ui/src/components/step-function-flow/FlowDiagram.jsx b/src/ui/src/components/step-function-flow/FlowDiagram.jsx
@@ -22,6 +22,11 @@ const isStepDisabled = (stepName, config) => {
     return config.assessment?.enabled === false;
   }
 
+  // Check if this is an evaluation step
+  if (stepNameLower.includes('evaluation') || stepNameLower.includes('evaluate')) {
+    return config.evaluation?.enabled === false;
+  }
+
   return false;
 };
 
@@ -238,6 +243,9 @@ FlowDiagram.propTypes = {
     assessment: PropTypes.shape({
       enabled: PropTypes.bool,
     }),
+    evaluation: PropTypes.shape({
+      enabled: PropTypes.bool,
+    }),
   }),
 };
 
diff --git a/src/ui/src/components/step-function-flow/StepDetails.jsx b/src/ui/src/components/step-function-flow/StepDetails.jsx
diff --git a/src/ui/src/components/step-function-flow/StepFunctionFlowViewer.jsx b/src/ui/src/components/step-function-flow/StepFunctionFlowViewer.jsx
diff --git a/template.yaml b/template.yaml