Skip to content

Commit be6c249

Browse files
committed
Updated error analyzer tools
1 parent a8a9f1b commit be6c249

File tree

2 files changed

+106
-46
lines changed

2 files changed

+106
-46
lines changed

lib/idp_common_pkg/idp_common/agents/error_analyzer/tools/cloudwatch_tool.py

Lines changed: 106 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -150,32 +150,58 @@ def cloudwatch_document_logs(
150150
"Unknown",
151151
)
152152

153-
for log_group in groups_to_search:
154-
search_result = _search_cloudwatch_logs(
155-
log_group_name=log_group["name"],
156-
filter_pattern=filter_pattern,
157-
max_events=max_log_events,
158-
start_time=document_start_time,
159-
end_time=document_end_time,
160-
request_id=request_id,
161-
)
153+
# Extract function type from Lambda function name (e.g., "ClassificationFunction")
154+
function_type = _extract_function_type(function_name)
155+
156+
# Find matching log group for this function type
157+
matching_log_groups = (
158+
[
159+
lg
160+
for lg in groups_to_search
161+
if function_type and function_type in lg["name"]
162+
]
163+
if function_type
164+
else []
165+
)
162166

163-
if search_result.get("events_found", 0) > 0:
164-
search_method_used = "lambda_request_id"
167+
# Only search the specific matching log group for this function's request ID
168+
log_groups_to_search = matching_log_groups
169+
170+
if log_groups_to_search:
171+
for log_group in log_groups_to_search:
165172
logger.info(
166-
f"Found {search_result['events_found']} error events in {log_group['name']} for Lambda function {function_name} using request ID {request_id}"
173+
f"Searching log group {log_group['name']} for Lambda function {function_name} ({function_type}) with request ID {request_id}"
167174
)
168-
all_results.append(
169-
{
170-
"log_group": log_group["name"],
171-
"lambda_function_name": function_name,
172-
"request_id": request_id,
173-
"search_method": "lambda_request_id",
174-
"events_found": search_result["events_found"],
175-
"events": search_result["events"],
176-
}
175+
# Use ERROR pattern and filter by request ID in post-processing
176+
search_result = _search_cloudwatch_logs(
177+
log_group_name=log_group["name"],
178+
filter_pattern="ERROR", # Search for errors, filter by request ID later
179+
max_events=max_log_events * 3, # Get more events to filter
180+
start_time=document_start_time,
181+
end_time=document_end_time,
182+
request_id=request_id,
177183
)
178-
total_events += search_result["events_found"]
184+
185+
if search_result.get("events_found", 0) > 0:
186+
search_method_used = "lambda_request_id"
187+
logger.info(
188+
f"Found {search_result['events_found']} error events in {log_group['name']} for Lambda function {function_name} using request ID {request_id}"
189+
)
190+
all_results.append(
191+
{
192+
"log_group": log_group["name"],
193+
"lambda_function_name": function_name,
194+
"request_id": request_id,
195+
"search_method": "lambda_request_id",
196+
"events_found": search_result["events_found"],
197+
"events": search_result["events"],
198+
}
199+
)
200+
total_events += search_result["events_found"]
201+
else:
202+
logger.info(
203+
f"No matching log group found for Lambda function {function_name} ({function_type})"
204+
)
179205

180206
# Stop if we found errors from the first (likely failed) function
181207
if total_events > 0:
@@ -352,9 +378,6 @@ def extract_error_keywords(log_events: List[LogEvent]) -> Dict[str, int]:
352378
"timeout",
353379
"fatal",
354380
"critical",
355-
"panic",
356-
"abort",
357-
"crash",
358381
"denied",
359382
"refused",
360383
]
@@ -418,13 +441,25 @@ def _search_cloudwatch_logs(
418441
if final_filter_pattern:
419442
params["filterPattern"] = final_filter_pattern
420443

444+
logger.info(
445+
f"CloudWatch search params for {log_group_name}: filter='{final_filter_pattern}', request_id={request_id}"
446+
)
447+
421448
response = client.filter_log_events(**params)
449+
logger.info(
450+
f"CloudWatch API returned {len(response.get('events', []))} raw events for {log_group_name}"
451+
)
422452

423453
events = []
424454
for event in response.get("events", []):
425455
message = event["message"]
426456
if _should_exclude_log_event(message, filter_pattern):
427457
continue
458+
459+
# When using request ID search, only include events with matching request ID
460+
if request_id and request_id not in message:
461+
continue
462+
428463
events.append(
429464
{
430465
"timestamp": datetime.fromtimestamp(
@@ -450,28 +485,13 @@ def _search_cloudwatch_logs(
450485

451486
def _build_filter_pattern(base_pattern: str, request_id: str = None) -> str:
452487
"""
453-
Build CloudWatch filter pattern combining request ID and error keywords.
454-
455-
Args:
456-
base_pattern: Base filter pattern (e.g., "ERROR")
457-
request_id: Lambda request ID for precise filtering
458-
459-
Returns:
460-
Optimized filter pattern string
488+
Build CloudWatch filter pattern. Use ERROR pattern and filter by request ID in post-processing.
461489
"""
462-
if request_id and base_pattern:
463-
# Use CloudWatch filter syntax: both request_id AND error pattern must be present
464-
sanitized_pattern = base_pattern.replace(":", "")
465-
combined_pattern = f"{request_id} {sanitized_pattern}"
466-
logger.debug(f"Building combined filter pattern: {combined_pattern}")
467-
return combined_pattern
468-
elif request_id:
469-
logger.debug(f"Building filter pattern with request ID: {request_id}")
470-
return request_id
490+
if request_id:
491+
# Use ERROR pattern, will filter by request ID in post-processing
492+
return base_pattern if base_pattern else "ERROR"
471493
elif base_pattern:
472-
sanitized_pattern = base_pattern.replace(":", "")
473-
logger.debug(f"Building filter pattern with base pattern: {sanitized_pattern}")
474-
return sanitized_pattern
494+
return base_pattern
475495
else:
476496
return ""
477497

@@ -585,6 +605,47 @@ def _get_log_group_prefix(stack_name: str) -> Dict[str, Any]:
585605
return create_error_response(str(e), stack_name=stack_name)
586606

587607

608+
def _extract_function_type(lambda_function_name: str) -> str:
609+
"""
610+
Extract function type from Lambda function name using pattern matching.
611+
612+
Examples:
613+
- DEV-P2-EA8-PATTERN2STACK-1H-ClassificationFunction-dSp68ELdR85C -> ClassificationFunction
614+
- DEV-P2-EA8-PATTERN2STACK-1HHT2VDXH7MW0-OCRFunction-EQ6aqmcsC4XO -> OCRFunction
615+
- DEV-P2-EA8-QueueProcessor-JweFNlBa4vkV -> QueueProcessor
616+
"""
617+
if not lambda_function_name:
618+
return ""
619+
620+
# Split by hyphens and look for parts ending with "Function" or "Processor"
621+
parts = lambda_function_name.split("-")
622+
623+
for part in parts:
624+
# Look for parts ending with common Lambda function suffixes
625+
if part.endswith(("Function", "Processor")) and len(part) > 8:
626+
return part
627+
628+
return ""
629+
630+
631+
def _is_error_event(message: str) -> bool:
632+
"""
633+
Check if a log message is an error event.
634+
"""
635+
message_upper = message.upper()
636+
error_indicators = [
637+
"[ERROR]",
638+
"ERROR:",
639+
"EXCEPTION",
640+
"FAILED",
641+
"FAILURE",
642+
"TIMEOUT",
643+
"FATAL",
644+
"CRITICAL",
645+
]
646+
return any(indicator in message_upper for indicator in error_indicators)
647+
648+
588649
def _should_exclude_log_event(message: str, filter_pattern: str = "") -> bool:
589650
"""
590651
Filter out noise from log events while preserving relevant error information.

lib/idp_common_pkg/idp_common/agents/error_analyzer/tools/xray_tool.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,6 @@ def extract_lambda_request_ids(xray_trace_id: str) -> Dict[str, str]:
303303

304304
try:
305305
response = xray_client.batch_get_traces(TraceIds=[xray_trace_id])
306-
logger.info(f"X-Ray batch_get_traces response for {xray_trace_id}: {response}")
307306

308307
traces = response.get("Traces", [])
309308
if not traces:

0 commit comments

Comments
 (0)