Skip to content

Commit 1a98cd6

Browse files
committed
Updated xray tracing tool
1 parent c3b910f commit 1a98cd6

File tree

5 files changed

+139
-186
lines changed

5 files changed

+139
-186
lines changed

lib/idp_common_pkg/idp_common/agents/error_analyzer/tools/cloudwatch_tool.py

Lines changed: 49 additions & 184 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,15 @@
77

88
import logging
99
import os
10-
import re
1110
from datetime import datetime, timedelta
12-
from typing import Any, Dict, List, Optional
11+
from typing import Any, Dict
1312

1413
import boto3
1514
from strands import tool
1615

1716
from ..config import create_error_response, safe_int_conversion
1817
from .lambda_tool import lambda_document_context
18+
from .xray_tool import extract_lambda_request_ids
1919

2020
logger = logging.getLogger(__name__)
2121

@@ -129,8 +129,7 @@ def search_cloudwatch_logs(
129129

130130
def _build_filter_pattern(base_pattern: str, request_id: str = None) -> str:
131131
"""
132-
Build CloudWatch filter pattern with request ID priority.
133-
Uses request ID alone first for maximum precision, then combines with error patterns.
132+
Build CloudWatch filter pattern combining request ID and error keywords.
134133
135134
Args:
136135
base_pattern: Base filter pattern (e.g., "ERROR")
@@ -139,12 +138,16 @@ def _build_filter_pattern(base_pattern: str, request_id: str = None) -> str:
139138
Returns:
140139
Optimized filter pattern string
141140
"""
142-
if request_id:
143-
# Use request ID alone for maximum precision
141+
if request_id and base_pattern:
142+
# Combine request ID with error pattern for precise error filtering
143+
sanitized_pattern = base_pattern.replace(":", "")
144+
combined_pattern = f"[{request_id}, {sanitized_pattern}]"
145+
logger.debug(f"Building combined filter pattern: {combined_pattern}")
146+
return combined_pattern
147+
elif request_id:
144148
logger.debug(f"Building filter pattern with request ID: {request_id}")
145149
return request_id
146150
elif base_pattern:
147-
# Fallback to base pattern only
148151
sanitized_pattern = base_pattern.replace(":", "")
149152
logger.debug(f"Building filter pattern with base pattern: {sanitized_pattern}")
150153
return sanitized_pattern
@@ -217,126 +220,6 @@ def _extract_prefix_from_state_machine_arn(arn: str) -> str:
217220
return ""
218221

219222

220-
def extract_request_ids_from_logs(
    log_groups: List[str],
    execution_id: str,
    start_time: datetime,
    end_time: datetime,
    max_log_groups: int = 5,
    events_per_group: int = 50,
) -> Dict[str, Any]:
    """
    Extract Lambda request IDs from CloudWatch logs in an execution time window.

    For each of the first ``max_log_groups`` log groups, fetches up to
    ``events_per_group`` events between ``start_time`` and ``end_time`` and
    records the first not-yet-seen request ID found for that group's function.

    NOTE: ``execution_id`` is only reported in the log output. No filter
    pattern is sent to CloudWatch, so correlation is purely time-based.

    Args:
        log_groups: List of log group names to search
        execution_id: Step Functions execution ID (used for logging only)
        start_time: Start time for log search
        end_time: End time for log search
        max_log_groups: Maximum number of log groups to query (default 5)
        events_per_group: Maximum events requested per log group (default 50)

    Returns:
        Dict with keys "function_request_map" (function name -> request ID),
        "all_request_ids" (unique request IDs in discovery order),
        "extraction_method" (always "cloudwatch_logs") and
        "extraction_success" (True when at least one request ID was found)
    """
    function_request_map: Dict[str, str] = {}
    ordered_request_ids: List[str] = []
    # Set gives O(1) duplicate checks; the list above keeps discovery order.
    seen_request_ids = set()

    client = boto3.client("logs")
    logger.info(
        f"Extracting request IDs from {len(log_groups)} log groups using execution ID: {execution_id}"
    )

    # The search window is identical for every log group, so convert it once.
    start_ms = int(start_time.timestamp() * 1000)
    end_ms = int(end_time.timestamp() * 1000)

    for log_group in log_groups[:max_log_groups]:
        # The function name depends only on the log group; without one no
        # mapping can be recorded, so skip the API call entirely.
        function_name = _extract_function_name_from_log_group(log_group)
        if not function_name:
            continue

        try:
            # No filter pattern: any event in the window may carry a request ID.
            response = client.filter_log_events(
                logGroupName=log_group,
                startTime=start_ms,
                endTime=end_ms,
                limit=events_per_group,
            )
        except Exception as e:
            # Best effort: a missing or inaccessible log group must not abort
            # the search of the remaining groups.
            logger.debug(f"Failed to search log group {log_group}: {e}")
            continue

        for event in response.get("events", []):
            message = event.get("message", "")
            request_id = _extract_request_id_from_log_message(message)
            if not request_id or request_id in seen_request_ids:
                continue

            function_request_map[function_name] = request_id
            seen_request_ids.add(request_id)
            ordered_request_ids.append(request_id)
            logger.info(
                f"Extracted request ID '{request_id}' for function '{function_name}' from CloudWatch logs"
            )
            logger.debug(f"Request ID found in message: {message[:200]}...")
            break  # One request ID per function is sufficient

    logger.info(
        f"CloudWatch extraction found {len(function_request_map)} function-request mappings"
    )
    return {
        "function_request_map": function_request_map,
        "all_request_ids": ordered_request_ids,
        "extraction_method": "cloudwatch_logs",
        "extraction_success": len(ordered_request_ids) > 0,
    }
286-
287-
288-
def _extract_request_id_from_log_message(message: str) -> Optional[str]:
289-
"""
290-
Extract Lambda request ID from CloudWatch log message.
291-
Lambda logs format: [LEVEL] timestamp request_id message
292-
293-
Args:
294-
message: CloudWatch log message
295-
296-
Returns:
297-
Request ID string if found, None otherwise
298-
"""
299-
if not message:
300-
return None
301-
302-
# Pattern for Lambda request ID in log messages
303-
# Format: [INFO] 2025-10-22T18:35:40.357Z 1386c0d2-a9d1-4169-940a-8d35c8899e27 message
304-
pattern = r"\[\w+\]\s+\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z\s+([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})"
305-
306-
match = re.search(pattern, message)
307-
if match:
308-
return match.group(1)
309-
310-
# Alternative pattern for different log formats - look for any UUID
311-
uuid_pattern = r"([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})"
312-
matches = re.findall(uuid_pattern, message, re.IGNORECASE)
313-
314-
# Return first UUID that looks like a request ID (not execution ID)
315-
for match in matches:
316-
if len(match) == 36: # Standard UUID length
317-
return match
318-
319-
return None
320-
321-
322-
def _extract_function_name_from_log_group(log_group: str) -> str:
323-
"""
324-
Extract Lambda function name from log group name.
325-
326-
Args:
327-
log_group: CloudWatch log group name
328-
329-
Returns:
330-
Function name string
331-
"""
332-
# Log group format: /aws/lambda/FunctionName or /prefix/lambda/FunctionName
333-
if "/lambda/" in log_group:
334-
return log_group.split("/lambda/")[-1]
335-
336-
# Fallback: use last part of log group name
337-
return log_group.split("/")[-1] if "/" in log_group else log_group
338-
339-
340223
def get_log_group_prefix(stack_name: str) -> Dict[str, Any]:
341224
"""
342225
Determines CloudWatch log group prefix from CloudFormation stack.
@@ -472,64 +355,23 @@ def cloudwatch_document_logs(
472355
f"Using time window with {buffer.total_seconds()}s buffer for batch operation isolation"
473356
)
474357

475-
# Enhanced search strategy with request ID priority
476-
request_ids = context.get("lambda_request_ids", [])
477-
function_request_map = context.get("function_request_map", {})
478-
failed_functions = context.get("failed_functions", [])
479-
primary_failed_function = context.get("primary_failed_function")
480-
execution_arn = context.get("execution_arn")
481-
execution_events_count = context.get("execution_events_count", 0)
482-
483-
logger.info(
484-
f"Step Functions extraction - Total request IDs: {len(request_ids)}, Failed functions: {len(failed_functions)}, Events: {execution_events_count}"
485-
)
486-
logger.info(
487-
f"CloudWatch extraction conditions - request_ids: {len(request_ids)}, execution_arn: {bool(execution_arn)}, start_time: {bool(start_time)}, end_time: {bool(end_time)}"
488-
)
358+
# X-Ray based request ID extraction
359+
trace_id = context.get("trace_id")
360+
function_request_map = {}
489361

490-
# NEW: CloudWatch-based request ID extraction if Step Functions extraction failed
491-
cloudwatch_extraction_used = False
492-
if len(request_ids) == 0 and execution_arn and start_time and end_time:
362+
if trace_id:
363+
logger.info(f"Extracting Lambda request IDs from X-Ray trace: {trace_id}")
364+
function_request_map = extract_lambda_request_ids(trace_id)
493365
logger.info(
494-
"Step Functions extraction yielded 0 request IDs, attempting CloudWatch log extraction"
495-
)
496-
497-
# Get log group names for extraction
498-
group_names = [g["name"] for g in log_groups.get("log_groups", [])]
499-
execution_id = execution_arn.split(":")[-1]
500-
501-
cloudwatch_extraction = extract_request_ids_from_logs(
502-
group_names, execution_id, start_time, end_time
503-
)
504-
505-
if cloudwatch_extraction.get("extraction_success"):
506-
# Override Step Functions results with CloudWatch extraction
507-
function_request_map = cloudwatch_extraction.get(
508-
"function_request_map", {}
509-
)
510-
request_ids = cloudwatch_extraction.get("all_request_ids", [])
511-
cloudwatch_extraction_used = True
512-
logger.info(
513-
f"CloudWatch extraction successful - Found {len(request_ids)} request IDs from {len(function_request_map)} functions"
514-
)
515-
else:
516-
logger.warning("CloudWatch extraction also failed to find request IDs")
517-
elif len(request_ids) == 0:
518-
logger.warning(
519-
f"CloudWatch extraction not attempted - missing conditions: execution_arn={bool(execution_arn)}, start_time={bool(start_time)}, end_time={bool(end_time)}"
366+
f"X-Ray extraction found {len(function_request_map)} Lambda functions: {function_request_map}"
520367
)
521368
else:
522-
logger.info(
523-
f"CloudWatch extraction not needed - Step Functions found {len(request_ids)} request IDs"
524-
)
369+
logger.warning("No trace_id found in document context")
525370

526-
logger.info(
527-
f"Total request IDs: {len(request_ids)}, Function mappings: {len(function_request_map)}"
528-
)
529-
logger.info(f"Function request mapping: {function_request_map}")
530-
logger.info(
531-
f"Extraction method: {'CloudWatch logs' if cloudwatch_extraction_used else 'Step Functions events'}"
532-
)
371+
request_ids = list(function_request_map.values())
372+
failed_functions = context.get("failed_functions", [])
373+
primary_failed_function = context.get("primary_failed_function")
374+
execution_arn = context.get("execution_arn")
533375

534376
# Priority 1: Request IDs from failed functions (highest priority)
535377
failed_function_request_ids = []
@@ -574,6 +416,19 @@ def cloudwatch_document_logs(
574416

575417
# Search with failed function request IDs (highest priority)
576418
for request_id in search_strategy["failed_function_request_ids"]:
419+
# Find function name for this request ID
420+
function_name = next(
421+
(
422+
func
423+
for func, rid in function_request_map.items()
424+
if rid == request_id
425+
),
426+
"Unknown",
427+
)
428+
logger.info(
429+
f"Filtering logs with Lambda function: {function_name}, request_id: {request_id}"
430+
)
431+
577432
for group in groups_to_search:
578433
log_group_name = group["name"]
579434
search_result = search_cloudwatch_logs(
@@ -616,6 +471,19 @@ def cloudwatch_document_logs(
616471
for request_id in search_strategy["other_request_ids"][
617472
:3
618473
]: # Limit to first 3
474+
# Find function name for this request ID
475+
function_name = next(
476+
(
477+
func
478+
for func, rid in function_request_map.items()
479+
if rid == request_id
480+
),
481+
"Unknown",
482+
)
483+
logger.info(
484+
f"Filtering logs with Lambda function: {function_name}, request_id: {request_id}"
485+
)
486+
619487
for group in groups_to_search:
620488
log_group_name = group["name"]
621489

@@ -776,10 +644,7 @@ def cloudwatch_document_logs(
776644
"document_status": context.get("document_status"),
777645
"execution_arn": execution_arn,
778646
"search_strategy": search_strategy,
779-
"cloudwatch_extraction_used": cloudwatch_extraction_used,
780-
"extraction_method": "cloudwatch_logs"
781-
if cloudwatch_extraction_used
782-
else "step_functions",
647+
"extraction_method": "xray_trace",
783648
"failed_functions": failed_functions,
784649
"primary_failed_function": primary_failed_function,
785650
"processing_time_window": {

lib/idp_common_pkg/idp_common/agents/error_analyzer/tools/xray_tool.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
X-Ray tools for tracing analysis and performance monitoring.
66
"""
77

8+
import json
89
import logging
910
from datetime import datetime, timedelta
1011
from typing import Any, Dict, List
@@ -123,6 +124,79 @@ def _analyze_trace_segments(segments: List[Dict[str, Any]]) -> Dict[str, Any]:
123124
}
124125

125126

127+
def _parse_segment_for_lambda(segment: Dict[str, Any]) -> List[Dict[str, Any]]:
128+
"""
129+
Recursively parse segment for Lambda executions.
130+
131+
Args:
132+
segment: X-Ray segment document
133+
134+
Returns:
135+
List of Lambda execution details
136+
"""
137+
lambda_executions = []
138+
139+
if segment.get("origin") == "AWS::Lambda":
140+
aws_info = segment.get("aws", {})
141+
function_name = segment.get("name", "Unknown")
142+
143+
if "resource_arn" in segment:
144+
function_name = segment["resource_arn"].split(":")[-1]
145+
146+
lambda_executions.append(
147+
{
148+
"function_name": function_name,
149+
"request_id": aws_info.get("request_id"),
150+
}
151+
)
152+
153+
for subsegment in segment.get("subsegments", []):
154+
lambda_executions.extend(_parse_segment_for_lambda(subsegment))
155+
156+
return lambda_executions
157+
158+
159+
def extract_lambda_request_ids(trace_id: str) -> Dict[str, str]:
    """
    Extract Lambda request IDs from X-Ray trace.

    Fetches the trace with ``batch_get_traces`` (following ``NextToken``
    pagination), parses every segment document and collects the request ID of
    each Lambda execution found. This is a best-effort lookup: any failure is
    logged and results in an empty mapping rather than an exception.

    When the same function name appears more than once in the trace, the
    request ID encountered last wins.

    Args:
        trace_id: X-Ray trace ID

    Returns:
        Dict mapping Lambda function names to their CloudWatch request IDs;
        empty when ``trace_id`` is empty, the trace is not found, or an
        error occurs
    """
    if not trace_id:
        return {}

    try:
        # Created inside the try so that configuration errors (e.g. missing
        # region or credentials) are handled like any other lookup failure.
        xray_client = boto3.client("xray")

        traces = []
        request_kwargs: Dict[str, Any] = {"TraceIds": [trace_id]}
        while True:
            response = xray_client.batch_get_traces(**request_kwargs)
            traces.extend(response.get("Traces", []))
            next_token = response.get("NextToken")
            if not next_token:
                break
            request_kwargs["NextToken"] = next_token

        # Convert to function_name -> request_id mapping
        result: Dict[str, str] = {}
        for trace in traces:
            for segment in trace.get("Segments", []):
                try:
                    segment_doc = json.loads(segment["Document"])
                except (KeyError, TypeError, json.JSONDecodeError):
                    # Skip only the malformed segment; keep the rest of the trace.
                    continue

                for execution in _parse_segment_for_lambda(segment_doc):
                    if execution["request_id"]:
                        result[execution["function_name"]] = execution["request_id"]

        return result

    except Exception as e:
        logger.error(f"Error extracting Lambda request IDs: {e}")
        return {}
198+
199+
126200
@tool
127201
def xray_service_map(
128202
service_name: str = None, hours_back: int = None

0 commit comments

Comments
 (0)