Updated Step Functions tool

webarch-ai · webarch-ai · commit 413792913bd7 · 2025-11-03T16:16:28.000-05:00
diff --git a/lib/idp_common_pkg/idp_common/agents/error_analyzer/agent.py b/lib/idp_common_pkg/idp_common/agents/error_analyzer/agent.py
@@ -15,11 +15,11 @@
 
 from ..common.strands_bedrock_model import create_strands_bedrock_model
 from .tools import (
+    analyze_workflow_execution,
     fetch_document_record,
     fetch_recent_records,
     lambda_lookup,
     search_cloudwatch_logs,
-    stepfunction_details,
     xray_performance_analysis,
     xray_trace,
 )
@@ -53,7 +53,7 @@ def create_error_analyzer_agent(
         fetch_document_record,
         fetch_recent_records,
         lambda_lookup,
-        stepfunction_details,
+        analyze_workflow_execution,
         xray_trace,
         xray_performance_analysis,
     ]
diff --git a/lib/idp_common_pkg/idp_common/agents/error_analyzer/tools/__init__.py b/lib/idp_common_pkg/idp_common/agents/error_analyzer/tools/__init__.py
@@ -19,7 +19,7 @@
     fetch_recent_records,
 )
 from .lambda_tool import lambda_lookup
-from .stepfunction_tool import stepfunction_details
+from .stepfunction_tool import analyze_workflow_execution
 from .xray_tool import (
     xray_performance_analysis,
     xray_trace,
@@ -30,7 +30,7 @@
     "lambda_lookup",
     "fetch_document_record",
     "fetch_recent_records",
-    "stepfunction_details",
+    "analyze_workflow_execution",
     "xray_trace",
     "xray_performance_analysis",
 ]
diff --git a/lib/idp_common_pkg/idp_common/agents/error_analyzer/tools/stepfunction_tool.py b/lib/idp_common_pkg/idp_common/agents/error_analyzer/tools/stepfunction_tool.py
@@ -13,14 +13,184 @@
 
 from idp_common.config import get_config
 
-from ..config import (
-    create_error_response,
-    create_response,
-)
-
 logger = logging.getLogger(__name__)
 
 
+@tool
+def analyze_workflow_execution(execution_arn: str) -> Dict[str, Any]:
+    """
+    Analyze Step Function workflow execution to identify failures and state transitions.
+
+    Performs comprehensive analysis of document processing workflow executions by
+    retrieving execution history, analyzing state transitions, identifying failure
+    points, and providing actionable recommendations. Essential for troubleshooting
+    document processing failures and understanding workflow behavior.
+
+    Use this tool when:
+    - Document processing failed and you have a Step Function execution ARN
+    - Need to understand where in the workflow a failure occurred
+    - Investigating workflow performance or timeout issues
+    - Analyzing state transitions and execution timeline
+    - User reports document processing stuck or failed
+
+    Tool chaining: Get execution ARN from fetch_document_record, then use this tool
+    for detailed workflow analysis. Follow up with search_cloudwatch_logs for
+    specific Lambda function errors identified in the failure analysis.
+
+    Example usage:
+    - "Analyze the workflow execution for this document"
+    - "What went wrong in the Step Function execution?"
+    - "Show me the workflow timeline and failure point"
+    - "Why did the document processing workflow fail?"
+    - "Trace the execution flow and identify issues"
+
+    Args:
+        execution_arn: Step Function execution ARN (get from document record's WorkflowExecutionArn or ExecutionArn field)
+
+    Returns:
+        Dict with keys:
+        - execution_status (str): Overall execution status (SUCCEEDED, FAILED, TIMED_OUT, etc.)
+        - duration_seconds (float): Total execution duration if completed
+        - timeline_analysis (dict): Detailed timeline with state transitions and failure point
+        - analysis_summary (str): Human-readable summary of execution and failure
+        - recommendations (list): Actionable next steps for investigation
+    """
+    try:
+        if not execution_arn:
+            return _build_response(
+                execution_status="ERROR",
+                analysis_summary="No execution ARN provided",
+                recommendations=[
+                    "Use search_cloudwatch_logs for detailed error information"
+                ],
+            )
+
+        # Get execution data from Step Functions
+        execution_data = _get_execution_data(execution_arn)
+
+        # Analyze timeline and failures
+        timeline_analysis = _analyze_execution_timeline(execution_data["events"])
+
+        # Extract execution metadata
+        execution_metadata = _extract_execution_metadata(
+            execution_data["execution_response"]
+        )
+
+        # Build analysis summary
+        analysis_summary = _build_analysis_summary(
+            execution_metadata["status"], timeline_analysis
+        )
+
+        # Generate recommendations
+        recommendations = _generate_recommendations(timeline_analysis)
+
+        return _build_response(
+            execution_status=execution_metadata["status"],
+            duration_seconds=execution_metadata["duration_seconds"],
+            timeline_analysis=timeline_analysis,
+            analysis_summary=analysis_summary,
+            recommendations=recommendations,
+        )
+
+    except Exception as e:
+        logger.error(f"Error analyzing Step Function execution {execution_arn}: {e}")
+        return _build_response(
+            execution_status="ERROR",
+            analysis_summary=f"Failed to analyze workflow execution: {str(e)}",
+            recommendations=[
+                "Use search_cloudwatch_logs for detailed error information"
+            ],
+        )
+
+
+def _get_execution_data(execution_arn: str) -> Dict[str, Any]:
+    """
+    Retrieve execution details and history from Step Functions.
+    """
+    stepfunctions_client = boto3.client("stepfunctions")
+
+    execution_response = stepfunctions_client.describe_execution(
+        executionArn=execution_arn
+    )
+
+    history_response = stepfunctions_client.get_execution_history(
+        executionArn=execution_arn,
+        maxResults=100,
+        reverseOrder=True,  # Most recent events first
+    )
+
+    return {
+        "execution_response": execution_response,
+        "events": history_response.get("events", []),
+    }
+
+
+def _extract_execution_metadata(execution_response: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Extract execution metadata including status and duration.
+    """
+    execution_status = execution_response.get("status", "UNKNOWN")
+    start_date = execution_response.get("startDate")
+    stop_date = execution_response.get("stopDate")
+
+    duration_seconds = None
+    if start_date and stop_date:
+        duration_seconds = (stop_date - start_date).total_seconds()
+
+    return {"status": execution_status, "duration_seconds": duration_seconds}
+
+
+def _build_analysis_summary(
+    execution_status: str, timeline_analysis: Dict[str, Any]
+) -> str:
+    """
+    Build human-readable analysis summary.
+    """
+    analysis_summary = f"Step Function execution {execution_status}"
+
+    if timeline_analysis.get("failure_point"):
+        failure_point = timeline_analysis["failure_point"]
+        analysis_summary += f" at state '{failure_point.get('state', 'Unknown')}'"
+        if failure_point.get("details", {}).get("error"):
+            analysis_summary += f": {failure_point['details']['error']}"
+
+    return analysis_summary
+
+
+def _generate_recommendations(timeline_analysis: Dict[str, Any]) -> List[str]:
+    """
+    Generate actionable recommendations based on analysis.
+    """
+    return [
+        "Check the failure point state for specific error details",
+        "Review Lambda function logs if failure occurred in Lambda task",
+        "Verify input data format if failure occurred early in workflow",
+        "Consider timeout adjustments if execution timed out",
+    ]
+
+
+def _build_response(
+    execution_status: str,
+    duration_seconds: Optional[float] = None,
+    timeline_analysis: Optional[Dict[str, Any]] = None,
+    analysis_summary: str = "",
+    recommendations: Optional[List[str]] = None,
+) -> Dict[str, Any]:
+    """
+    Build unified workflow analysis response with logging.
+    """
+    response = {
+        "execution_status": execution_status,
+        "duration_seconds": duration_seconds,
+        "timeline_analysis": timeline_analysis or {},
+        "analysis_summary": analysis_summary,
+        "recommendations": recommendations or [],
+    }
+
+    logger.info(f"Workflow analysis response: {response}")
+    return response
+
+
 def _extract_failure_details(event: Dict[str, Any]) -> Optional[Dict[str, Any]]:
     """
     Failure parser: Extracts detailed error information from Step Function events.
@@ -152,77 +322,3 @@ def _analyze_execution_timeline(events: List[Dict[str, Any]]) -> Dict[str, Any]:
         "failure_point": failure_point,
         "last_successful_state": last_successful_state,
     }
-
-
-@tool
-def stepfunction_details(execution_arn: str) -> Dict[str, Any]:
-    """
-    Analyze Step Function execution to identify workflow failures and state transitions.
-    Retrieves execution history and performs comprehensive analysis to identify failure points,
-    state transitions, and execution patterns for document processing workflows.
-
-    Args:
-        execution_arn: Step Function execution ARN from document context
-
-    Returns:
-        Dict containing execution analysis, timeline, and failure details
-    """
-    try:
-        if not execution_arn:
-            return create_error_response("No execution ARN provided")
-
-        stepfunctions_client = boto3.client("stepfunctions")
-
-        # Get execution details
-        execution_response = stepfunctions_client.describe_execution(
-            executionArn=execution_arn
-        )
-
-        # Get execution history
-        history_response = stepfunctions_client.get_execution_history(
-            executionArn=execution_arn,
-            maxResults=100,
-            reverseOrder=True,  # Most recent events first
-        )
-
-        events = history_response.get("events", [])
-
-        # Analyze timeline and failures
-        timeline_analysis = _analyze_execution_timeline(events)
-
-        # Extract execution metadata
-        execution_status = execution_response.get("status", "UNKNOWN")
-        start_date = execution_response.get("startDate")
-        stop_date = execution_response.get("stopDate")
-
-        # Calculate execution duration
-        duration_seconds = None
-        if start_date and stop_date:
-            duration_seconds = (stop_date - start_date).total_seconds()
-
-        # Build analysis summary
-        analysis_summary = f"Step Function execution {execution_status}"
-        if timeline_analysis.get("failure_point"):
-            failure_point = timeline_analysis["failure_point"]
-            analysis_summary += f" at state '{failure_point.get('state', 'Unknown')}'"
-            if failure_point.get("details", {}).get("error"):
-                analysis_summary += f": {failure_point['details']['error']}"
-
-        return create_response(
-            {
-                "execution_status": execution_status,
-                "duration_seconds": duration_seconds,
-                "timeline_analysis": timeline_analysis,
-                "analysis_summary": analysis_summary,
-                "recommendations": [
-                    "Check the failure point state for specific error details",
-                    "Review Lambda function logs if failure occurred in Lambda task",
-                    "Verify input data format if failure occurred early in workflow",
-                    "Consider timeout adjustments if execution timed out",
-                ],
-            }
-        )
-
-    except Exception as e:
-        logger.error(f"Error analyzing Step Function execution {execution_arn}: {e}")
-        return create_error_response(str(e))
diff --git a/lib/idp_common_pkg/tests/unit/agents/error_analyzer/test_agent.py b/lib/idp_common_pkg/tests/unit/agents/error_analyzer/test_agent.py
@@ -102,14 +102,14 @@ def test_agent_system_prompt_format(
     def test_specific_tools_import(self):
         """Test that specific tools can be imported correctly."""
         from idp_common.agents.error_analyzer.tools import (
-            search_cloudwatch_logs,
+            analyze_workflow_execution,
             fetch_document_record,
-            xray_trace,
+            search_cloudwatch_logs,
         )
 
         assert search_cloudwatch_logs is not None
         assert callable(search_cloudwatch_logs)
         assert fetch_document_record is not None
         assert callable(fetch_document_record)
-        assert xray_trace is not None
-        assert callable(xray_trace)
+        assert analyze_workflow_execution is not None
+        assert callable(analyze_workflow_execution)
diff --git a/lib/idp_common_pkg/tests/unit/agents/error_analyzer/test_tools.py b/lib/idp_common_pkg/tests/unit/agents/error_analyzer/test_tools.py
@@ -36,14 +36,14 @@ def test_dynamodb_tools_import(self):
     def test_execution_context_tools_import(self):
         """Test execution context tools can be imported."""
         from idp_common.agents.error_analyzer.tools import (
+            analyze_workflow_execution,
             lambda_lookup,
-            stepfunction_details,
         )
 
         assert lambda_lookup is not None
         assert callable(lambda_lookup)
-        assert stepfunction_details is not None
-        assert callable(stepfunction_details)
+        assert analyze_workflow_execution is not None
+        assert callable(analyze_workflow_execution)
 
     def test_xray_tools_import(self):
         """Test X-Ray tools can be imported."""
@@ -66,7 +66,7 @@ def test_all_tools_available(self):
             "fetch_document_record",
             "fetch_recent_records",
             "lambda_lookup",
-            "stepfunction_details",
+            "analyze_workflow_execution",
             "xray_trace",
             "xray_performance_analysis",
         }