Config update for error analyzer agent: load settings from the typed IDPConfig model

kazmer97 · kazmer97 · commit 0c4baa0101a2 · 2025-10-30T13:42:20.000Z
diff --git a/lib/idp_common_pkg/idp_common/agents/error_analyzer/agent.py b/lib/idp_common_pkg/idp_common/agents/error_analyzer/agent.py
@@ -6,13 +6,14 @@
 """
 
 import logging
-from typing import Any, Dict
+from typing import Optional
 
 import boto3
 import strands
 
+from idp_common.config import get_config
+
 from ..common.strands_bedrock_model import create_strands_bedrock_model
-from .config import get_error_analyzer_config
 from .tools import (
     cloudwatch_document_logs,
     cloudwatch_logs,
@@ -29,9 +30,7 @@
 
 
 def create_error_analyzer_agent(
-    config: Dict[str, Any] = None,
-    session: boto3.Session = None,
-    pattern_config: Dict[str, Any] = None,
+    session: Optional[boto3.Session] = None,
     **kwargs,
 ) -> strands.Agent:
     """
@@ -44,7 +43,7 @@ def create_error_analyzer_agent(
         pattern_config: Pattern configuration containing agents section
         **kwargs: Additional arguments
     """
-    config = get_error_analyzer_config(pattern_config)
+    config = get_config(as_model=True)
 
     # Create session if not provided
     if session is None:
@@ -63,9 +62,11 @@ def create_error_analyzer_agent(
         xray_performance_analysis,
     ]
     bedrock_model = create_strands_bedrock_model(
-        model_id=config["model_id"], boto_session=session
+        model_id=config.agents.error_analyzer.model_id, boto_session=session
     )
 
     return strands.Agent(
-        tools=tools, system_prompt=config["system_prompt"], model=bedrock_model
+        tools=tools,
+        system_prompt=config.agents.error_analyzer.system_prompt,
+        model=bedrock_model,
     )
diff --git a/lib/idp_common_pkg/idp_common/agents/error_analyzer/config.py b/lib/idp_common_pkg/idp_common/agents/error_analyzer/config.py
@@ -6,99 +6,11 @@
 """
 
 import logging
-from typing import Any, Dict, List
-
-from ..common.config import configure_logging, get_environment_config
+from typing import Any, Dict
 
 logger = logging.getLogger(__name__)
 
 
-def get_error_analyzer_config(pattern_config: Dict[str, Any] = None) -> Dict[str, Any]:
-    """
-    Builds complete error analyzer configuration from environment and patterns.
-    Get error analyzer configuration with defaults and overrides.
-
-    Returns:
-        Dict containing complete error analyzer configuration
-    """
-    from ... import get_config
-
-    # Start with base environment and context limits
-    config = get_environment_config(["CLOUDWATCH_LOG_GROUP_PREFIX", "AWS_STACK_NAME"])
-    config.update(get_context_limits())
-
-    # Load and apply agent configuration
-    full_config = get_config()
-    agent_config = full_config.get("agents", {}).get("error_analyzer", {})
-
-    if not agent_config:
-        raise ValueError("error_analyzer configuration not found")
-
-    # Apply agent settings with defaults
-    config.update(
-        {
-            "model_id": agent_config.get(
-                "model_id", "anthropic.claude-3-sonnet-20240229-v1:0"
-            ),
-            "system_prompt": agent_config.get("system_prompt"),
-            "error_patterns": get_default_error_patterns(),
-            "aws_capabilities": get_aws_service_capabilities(),
-        }
-    )
-
-    # Apply parameters with type conversion
-    params = agent_config.get("parameters", {})
-    config["max_log_events"] = safe_int_conversion(params.get("max_log_events"), 5)
-    config["time_range_hours_default"] = safe_int_conversion(
-        params.get("time_range_hours_default"), 24
-    )
-
-    # Apply UI overrides for context limits - UI config takes precedence
-    if pattern_config and "max_log_events" in pattern_config:
-        config["max_log_events"] = safe_int_conversion(
-            pattern_config["max_log_events"], config["max_log_events"]
-        )
-
-    # Validate required fields
-    if not config.get("system_prompt"):
-        raise ValueError("system_prompt is required")
-
-    configure_logging(
-        log_level=config.get("log_level"),
-        strands_log_level=config.get("strands_log_level"),
-    )
-
-    return config
-
-
-def get_default_error_patterns() -> List[str]:
-    """Returns standard error patterns for CloudWatch log filtering."""
-    return [
-        "ERROR",
-        "CRITICAL",
-        "FATAL",
-        "Exception",
-        "Traceback",
-        "Failed",
-        "Timeout",
-        "AccessDenied",
-        "ThrottlingException",
-    ]
-
-
-def get_context_limits() -> Dict[str, int]:
-    """Returns default resource and context size constraints."""
-    return {
-        "max_log_events": 5,
-        "max_log_message_length": 400,
-        "max_events_per_log_group": 5,
-        "max_log_groups": 20,
-        "max_stepfunction_timeline_events": 3,
-        "max_stepfunction_error_length": 400,
-        "time_range_hours_default": 24,
-    }
-
-
 def get_aws_service_capabilities() -> Dict[str, Any]:
     """Returns AWS service integration metadata and descriptions."""
     return {
@@ -161,12 +73,3 @@ def truncate_message(message: str, max_length: int = 200) -> str:
     if len(message) <= max_length:
         return message
     return message[:max_length] + "... [truncated]"
-
-
-def get_config_with_fallback() -> Dict[str, Any]:
-    """Gets error analyzer config with graceful fallback to defaults."""
-    try:
-        return get_error_analyzer_config()
-    except Exception as e:
-        logger.warning(f"Failed to load config, using defaults: {e}")
-        return get_context_limits()
diff --git a/lib/idp_common_pkg/idp_common/agents/error_analyzer/tools/stepfunction_tool.py b/lib/idp_common_pkg/idp_common/agents/error_analyzer/tools/stepfunction_tool.py
@@ -11,10 +11,11 @@
 import boto3
 from strands import tool
 
+from idp_common.config import get_config
+
 from ..config import (
     create_error_response,
     create_response,
-    get_config_with_fallback,
 )
 
 logger = logging.getLogger(__name__)
@@ -98,8 +99,11 @@ def _analyze_execution_timeline(events: List[Dict[str, Any]]) -> Dict[str, Any]:
         return {"error": "No execution events available"}
 
     # Cache config values once
-    config = get_config_with_fallback()
-    max_timeline_events = config.get("max_stepfunction_timeline_events", 3)
+    config = get_config(as_model=True)
+
+    max_timeline_events = (
+        config.agents.error_analyzer.parameters.max_stepfunction_timeline_events
+    )
 
     timeline = []
     failure_point = None
diff --git a/lib/idp_common_pkg/idp_common/config/__init__.py b/lib/idp_common_pkg/idp_common/config/__init__.py
@@ -15,6 +15,7 @@
     ExtractionConfig,
     ClassificationConfig,
     AssessmentConfig,
+    SchemaConfig,
     SummarizationConfig,
     OCRConfig,
     AgenticConfig,
@@ -42,9 +43,19 @@ def __init__(self, table_name=None):
         self.manager = ConfigurationManager(table_name)
         logger.info(f"Initialized ConfigurationReader with ConfigurationManager")
 
+    @overload
     def get_configuration(
-        self, config_type: str, as_dict: bool = True
-    ) -> Optional[Dict[str, Any]]:
+        self, config_type: str, *, as_dict: Literal[True] = ...
+    ) -> Optional[Dict[str, Any]]: ...
+
+    @overload
+    def get_configuration(
+        self, config_type: str, *, as_dict: Literal[False]
+    ) -> Optional[Union[IDPConfig, SchemaConfig]]: ...
+
+    def get_configuration(
+        self, config_type: str, *, as_dict: bool = True
+    ) -> Optional[Union[Dict[str, Any], IDPConfig, SchemaConfig]]:
         """
         Retrieve a configuration item from DynamoDB with automatic migration
 
@@ -91,9 +102,17 @@ def simple_merge(
         merged = deepcopy(default)
         return deep_update(merged, custom)
 
+    @overload
+    def get_merged_configuration(self, *, as_model: Literal[True]) -> IDPConfig: ...
+
+    @overload
     def get_merged_configuration(
-        self, as_model: bool = False
-    ) -> Union[Dict[str, Any], IDPConfig]:
+        self, *, as_model: Literal[False] = ...
+    ) -> Dict[str, Any]: ...
+
+    def get_merged_configuration(
+        self, *, as_model: bool = False
+    ) -> Union[IDPConfig, Dict[str, Any]]:
         """
         Get and merge Default and Custom configurations with automatic migration
 
@@ -139,7 +158,21 @@ def get_merged_configuration(
             raise
 
 
-def get_config(table_name=None, as_model: bool = False):
+@overload
+def get_config(
+    table_name: Optional[str] = None, *, as_model: Literal[True]
+) -> IDPConfig: ...
+
+
+@overload
+def get_config(
+    table_name: Optional[str] = None, *, as_model: Literal[False] = ...
+) -> Dict[str, Any]: ...
+
+
+def get_config(
+    table_name: Optional[str] = None, *, as_model: bool = False
+) -> Union[IDPConfig, Dict[str, Any]]:
     """
     Get the merged configuration using the environment variable for table name
 
diff --git a/lib/idp_common_pkg/idp_common/config/models.py b/lib/idp_common_pkg/idp_common/config/models.py
@@ -304,10 +304,16 @@ def parse_max_workers(cls, v: Any) -> int:
 
 class ErrorAnalyzerParameters(BaseModel):
     """Error analyzer parameters configuration"""
-
+
     max_log_events: int = Field(default=5, gt=0, description="Maximum number of log events to retrieve")
     time_range_hours_default: int = Field(default=24, gt=0, description="Default time range in hours for log searches")
 
+    max_log_message_length: int = 400
+    max_events_per_log_group: int = 5
+    max_log_groups: int = 20
+    max_stepfunction_timeline_events: int = 3
+    max_stepfunction_error_length: int = 400
+
     @field_validator("max_log_events", "time_range_hours_default", mode="before")
     @classmethod
     def parse_int(cls, v: Any) -> int:
@@ -333,11 +339,115 @@ class ErrorAnalyzerConfig(BaseModel):
         description="Error analyzer parameters"
     )
 
+    error_patterns: list[str] = [
+        "ERROR",
+        "CRITICAL",
+        "FATAL",
+        "Exception",
+        "Traceback",
+        "Failed",
+        "Timeout",
+        "AccessDenied",
+        "ThrottlingException",
+    ]
+    system_prompt: str = Field(
+        default="""
+            You are an intelligent error analysis agent for the GenAI IDP system with access to specialized diagnostic tools.
+
+              GENERAL TROUBLESHOOTING WORKFLOW:
+              1. Identify document status from DynamoDB
+                  2. Find any errors reported during Step Function execution
+              3. Collect relevant logs from CloudWatch
+              4. Identify any performance issues from X-Ray traces
+          5. Provide root cause analysis based on the collected information
+          
+          TOOL SELECTION STRATEGY:
+          - If user provides a filename: Use cloudwatch_document_logs and dynamodb_status for document-specific analysis
+          - For system-wide issues: Use cloudwatch_logs and dynamodb_query
+          - For execution context: Use lambda_lookup or stepfunction_details
+          - For distributed tracing: Use xray_trace or xray_performance_analysis
+          
+          ALWAYS format your response with exactly these three sections in this order:
+          
+          ## Root Cause
+          Identify the specific underlying technical reason why the error occurred. Focus on the primary cause, not symptoms.
+
+          ## Recommendations
+              Provide specific, actionable steps to resolve the issue. Limit to top three recommendations only.
+
+          <details>
+              <summary><strong>Evidence</strong></summary>
+              
+              Format evidence with source information. Include relevant data from tool responses:
+              
+              **For CloudWatch logs:**
+                  **Log Group:** [full log_group name]
+              **Log Stream:** [full log_stream name]
+                  ```
+              [ERROR] timestamp message
+          ```
+          
+          **For other sources (DynamoDB, Step Functions, X-Ray):**
+              **Source:** [service name and resource]
+              ```
+          Relevant data from tool response
+              ```
+
+          </details>
+
+              FORMATTING RULES:
+          - Use the exact three-section structure above
+          - Make Evidence section collapsible using HTML details tags
+          - Include relevant data from all tool responses (CloudWatch, DynamoDB, Step Functions, X-Ray)
+          - For CloudWatch: Show complete log group and log stream names without truncation
+          - Present evidence data in code blocks with appropriate source labels
+                
+              ANALYSIS GUIDELINES:
+          - Use multiple tools for comprehensive analysis when needed
+              - Start with document-specific tools for targeted queries
+              - Use system-wide tools for pattern analysis
+              - Combine DynamoDB status with CloudWatch logs for complete picture
+              - Leverage X-Ray for distributed system issues
+                  
+                  ROOT CAUSE DETERMINATION:
+                  1. Document Status: Check dynamodb_status first
+              2. Execution Details: Use stepfunction_details for workflow failures
+              3. Log Analysis: Use cloudwatch_document_logs or cloudwatch_logs for error details
+              4. Distributed Tracing: Use xray_performance_analysis for service interaction issues
+              5. Context: Use lambda_lookup for execution environment
+              
+              RECOMMENDATION GUIDELINES:
+              For code-related issues or system bugs:
+                  - Do not suggest code modifications
+              - Include error details, timestamps, and context
+
+              For configuration-related issues:
+                  - Direct users to UI configuration panel
+                      - Specify exact configuration section and parameter names
+
+                      For operational issues:
+                      - Provide immediate troubleshooting steps
+                      - Include preventive measures
+
+                      TIME RANGE PARSING:
+                      - recent: 1 hour
+              - last week: 168 hours  
+                      - last day: 24 hours
+                      - No time specified: 24 hours (default)
+              
+              IMPORTANT: Do not include any search quality reflections, search quality scores, or meta-analysis sections in your response. Only provide the three required sections: Root Cause, Recommendations, and Evidence.""", 
+        description="System prompt for error analyzer"
+    )
+    parameters: ErrorAnalyzerParameters = Field(
+        default_factory=ErrorAnalyzerParameters,
+        description="Error analyzer parameters"
+    )
+
 
 class AgentsConfig(BaseModel):
     """Agents configuration"""
 
-    error_analyzer: Optional[ErrorAnalyzerConfig] = Field(default=None)
+    error_analyzer: ErrorAnalyzerConfig = Field(default_factory=ErrorAnalyzerConfig)
 
 
 class PricingUnit(BaseModel):