77
88import logging
99import os
10- import re
1110from datetime import datetime , timedelta
12- from typing import Any , Dict , List , Optional
11+ from typing import Any , Dict
1312
1413import boto3
1514from strands import tool
1615
1716from ..config import create_error_response , safe_int_conversion
1817from .lambda_tool import lambda_document_context
18+ from .xray_tool import extract_lambda_request_ids
1919
2020logger = logging .getLogger (__name__ )
2121
@@ -129,8 +129,7 @@ def search_cloudwatch_logs(
129129
130130def _build_filter_pattern (base_pattern : str , request_id : str = None ) -> str :
131131 """
132- Build CloudWatch filter pattern with request ID priority.
133- Uses request ID alone first for maximum precision, then combines with error patterns.
132+ Build CloudWatch filter pattern combining request ID and error keywords.
134133
135134 Args:
136135 base_pattern: Base filter pattern (e.g., "ERROR")
@@ -139,12 +138,16 @@ def _build_filter_pattern(base_pattern: str, request_id: str = None) -> str:
139138 Returns:
140139 Optimized filter pattern string
141140 """
142- if request_id :
143- # Use request ID alone for maximum precision
141+ if request_id and base_pattern :
142+ # Combine request ID with error pattern for precise error filtering
143+ sanitized_pattern = base_pattern .replace (":" , "" )
144+ combined_pattern = f"[{ request_id } , { sanitized_pattern } ]"
145+ logger .debug (f"Building combined filter pattern: { combined_pattern } " )
146+ return combined_pattern
147+ elif request_id :
144148 logger .debug (f"Building filter pattern with request ID: { request_id } " )
145149 return request_id
146150 elif base_pattern :
147- # Fallback to base pattern only
148151 sanitized_pattern = base_pattern .replace (":" , "" )
149152 logger .debug (f"Building filter pattern with base pattern: { sanitized_pattern } " )
150153 return sanitized_pattern
@@ -217,126 +220,6 @@ def _extract_prefix_from_state_machine_arn(arn: str) -> str:
217220 return ""
218221
219222
def extract_request_ids_from_logs(
    log_groups: List[str],
    execution_id: str,
    start_time: datetime,
    end_time: datetime,
    max_log_groups: int = 5,
    events_per_group: int = 50,
) -> Dict[str, Any]:
    """
    Extract Lambda request IDs from CloudWatch logs inside a time window.

    For each of the first ``max_log_groups`` log groups, fetches up to
    ``events_per_group`` events between ``start_time`` and ``end_time`` (no
    filter pattern) and records the first request ID found that has not
    already been recorded for another group.

    Note: ``execution_id`` is only included in the log output. The search
    itself is scoped by the time window alone, not filtered by execution ID.

    Args:
        log_groups: List of log group names to search
        execution_id: Step Functions execution ID (used for logging only)
        start_time: Start time for log search
        end_time: End time for log search
        max_log_groups: Maximum number of log groups to query (default 5)
        events_per_group: Maximum events requested per log group (default 50)

    Returns:
        Dict with keys:
            function_request_map: function name -> request ID
            all_request_ids: unique request IDs in discovery order
            extraction_method: always "cloudwatch_logs"
            extraction_success: True when at least one request ID was found
    """
    function_request_map: Dict[str, str] = {}
    ordered_request_ids: List[str] = []
    seen_request_ids = set()

    client = boto3.client("logs")
    logger.info(
        f"Extracting request IDs from {len(log_groups)} log groups using execution ID: {execution_id}"
    )

    # The window is the same for every log group, so convert to epoch
    # milliseconds once instead of on every iteration.
    start_ms = int(start_time.timestamp() * 1000)
    end_ms = int(end_time.timestamp() * 1000)

    for log_group in log_groups[:max_log_groups]:
        # The function name depends only on the log group; without one no
        # mapping can be recorded, so skip the API call entirely.
        function_name = _extract_function_name_from_log_group(log_group)
        if not function_name:
            continue

        try:
            response = client.filter_log_events(
                logGroupName=log_group,
                startTime=start_ms,
                endTime=end_ms,
                limit=events_per_group,
            )
        except Exception as e:
            # Best effort: one unreadable log group must not abort the rest,
            # but the failure should be visible at normal log levels.
            logger.warning(f"Failed to search log group {log_group}: {e}")
            continue

        for event in response.get("events", []):
            message = event.get("message", "")
            request_id = _extract_request_id_from_log_message(message)
            if not request_id or request_id in seen_request_ids:
                continue

            function_request_map[function_name] = request_id
            seen_request_ids.add(request_id)
            ordered_request_ids.append(request_id)
            logger.info(
                f"Extracted request ID '{request_id}' for function '{function_name}' from CloudWatch logs"
            )
            logger.debug(f"Request ID found in message: {message[:200]}...")
            break  # One request ID per function is sufficient

    logger.info(
        f"CloudWatch extraction found {len(function_request_map)} function-request mappings"
    )
    return {
        "function_request_map": function_request_map,
        # Already de-duplicated above; keep discovery order so the result is
        # deterministic across runs.
        "all_request_ids": list(ordered_request_ids),
        "extraction_method": "cloudwatch_logs",
        "extraction_success": len(ordered_request_ids) > 0,
    }
286-
287-
def _extract_request_id_from_log_message(message: str) -> Optional[str]:
    """
    Extract Lambda request ID from CloudWatch log message.

    Two strategies are tried in order:
      1. Structured line: ``[LEVEL] <ISO-8601 timestamp>Z <request_id> ...``
         e.g. ``[INFO] 2025-10-22T18:35:40.357Z 1386c0d2-... message``
      2. Fallback: the first UUID-shaped token anywhere in the message.

    Both strategies accept upper- and lower-case hex digits, so an upper-case
    request ID in a structured line is matched by strategy 1 rather than
    losing to an unrelated UUID that appears earlier in the message.

    Args:
        message: CloudWatch log message

    Returns:
        Request ID string if found, None otherwise
    """
    if not message:
        return None

    uuid_pattern = (
        r"[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}"
        r"-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}"
    )

    # Preferred: the UUID sitting right after "[LEVEL] timestamp".
    structured = re.search(
        r"\[\w+\]\s+\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z\s+("
        + uuid_pattern
        + r")",
        message,
    )
    if structured:
        return structured.group(1)

    # Fallback for other log formats: first UUID-shaped token. This cannot
    # tell a request ID from any other UUID in the message.
    fallback = re.search(uuid_pattern, message)
    return fallback.group(0) if fallback else None
320-
321-
def _extract_function_name_from_log_group(log_group: str) -> str:
    """
    Extract Lambda function name from log group name.

    Handles ``/aws/lambda/FunctionName`` and ``/prefix/lambda/FunctionName``
    by taking everything after the last ``/lambda/`` segment. Any other name
    falls back to its last ``/``-separated component. Trailing slashes are
    ignored so they never produce an empty or slash-terminated name.

    Args:
        log_group: CloudWatch log group name

    Returns:
        Function name string; empty string when the input is empty/None or
        no name component is present.
    """
    if not log_group:
        return ""

    if "/lambda/" in log_group:
        # Log group format: /aws/lambda/FunctionName or /prefix/lambda/FunctionName
        name = log_group.split("/lambda/")[-1]
    else:
        # Fallback: last path component (the whole string when there is no "/").
        name = log_group.rstrip("/").split("/")[-1]

    return name.rstrip("/")
338-
339-
340223def get_log_group_prefix (stack_name : str ) -> Dict [str , Any ]:
341224 """
342225 Determines CloudWatch log group prefix from CloudFormation stack.
@@ -472,64 +355,23 @@ def cloudwatch_document_logs(
472355 f"Using time window with { buffer .total_seconds ()} s buffer for batch operation isolation"
473356 )
474357
475- # Enhanced search strategy with request ID priority
476- request_ids = context .get ("lambda_request_ids" , [])
477- function_request_map = context .get ("function_request_map" , {})
478- failed_functions = context .get ("failed_functions" , [])
479- primary_failed_function = context .get ("primary_failed_function" )
480- execution_arn = context .get ("execution_arn" )
481- execution_events_count = context .get ("execution_events_count" , 0 )
482-
483- logger .info (
484- f"Step Functions extraction - Total request IDs: { len (request_ids )} , Failed functions: { len (failed_functions )} , Events: { execution_events_count } "
485- )
486- logger .info (
487- f"CloudWatch extraction conditions - request_ids: { len (request_ids )} , execution_arn: { bool (execution_arn )} , start_time: { bool (start_time )} , end_time: { bool (end_time )} "
488- )
358+ # X-Ray based request ID extraction
359+ trace_id = context .get ("trace_id" )
360+ function_request_map = {}
489361
490- # NEW: CloudWatch-based request ID extraction if Step Functions extraction failed
491- cloudwatch_extraction_used = False
492- if len ( request_ids ) == 0 and execution_arn and start_time and end_time :
362+ if trace_id :
363+ logger . info ( f"Extracting Lambda request IDs from X-Ray trace: { trace_id } " )
364+ function_request_map = extract_lambda_request_ids ( trace_id )
493365 logger .info (
494- "Step Functions extraction yielded 0 request IDs, attempting CloudWatch log extraction"
495- )
496-
497- # Get log group names for extraction
498- group_names = [g ["name" ] for g in log_groups .get ("log_groups" , [])]
499- execution_id = execution_arn .split (":" )[- 1 ]
500-
501- cloudwatch_extraction = extract_request_ids_from_logs (
502- group_names , execution_id , start_time , end_time
503- )
504-
505- if cloudwatch_extraction .get ("extraction_success" ):
506- # Override Step Functions results with CloudWatch extraction
507- function_request_map = cloudwatch_extraction .get (
508- "function_request_map" , {}
509- )
510- request_ids = cloudwatch_extraction .get ("all_request_ids" , [])
511- cloudwatch_extraction_used = True
512- logger .info (
513- f"CloudWatch extraction successful - Found { len (request_ids )} request IDs from { len (function_request_map )} functions"
514- )
515- else :
516- logger .warning ("CloudWatch extraction also failed to find request IDs" )
517- elif len (request_ids ) == 0 :
518- logger .warning (
519- f"CloudWatch extraction not attempted - missing conditions: execution_arn={ bool (execution_arn )} , start_time={ bool (start_time )} , end_time={ bool (end_time )} "
366+ f"X-Ray extraction found { len (function_request_map )} Lambda functions: { function_request_map } "
520367 )
521368 else :
522- logger .info (
523- f"CloudWatch extraction not needed - Step Functions found { len (request_ids )} request IDs"
524- )
369+ logger .warning ("No trace_id found in document context" )
525370
526- logger .info (
527- f"Total request IDs: { len (request_ids )} , Function mappings: { len (function_request_map )} "
528- )
529- logger .info (f"Function request mapping: { function_request_map } " )
530- logger .info (
531- f"Extraction method: { 'CloudWatch logs' if cloudwatch_extraction_used else 'Step Functions events' } "
532- )
371+ request_ids = list (function_request_map .values ())
372+ failed_functions = context .get ("failed_functions" , [])
373+ primary_failed_function = context .get ("primary_failed_function" )
374+ execution_arn = context .get ("execution_arn" )
533375
534376 # Priority 1: Request IDs from failed functions (highest priority)
535377 failed_function_request_ids = []
@@ -574,6 +416,19 @@ def cloudwatch_document_logs(
574416
575417 # Search with failed function request IDs (highest priority)
576418 for request_id in search_strategy ["failed_function_request_ids" ]:
419+ # Find function name for this request ID
420+ function_name = next (
421+ (
422+ func
423+ for func , rid in function_request_map .items ()
424+ if rid == request_id
425+ ),
426+ "Unknown" ,
427+ )
428+ logger .info (
429+ f"Filtering logs with Lambda function: { function_name } , request_id: { request_id } "
430+ )
431+
577432 for group in groups_to_search :
578433 log_group_name = group ["name" ]
579434 search_result = search_cloudwatch_logs (
@@ -616,6 +471,19 @@ def cloudwatch_document_logs(
616471 for request_id in search_strategy ["other_request_ids" ][
617472 :3
618473 ]: # Limit to first 3
474+ # Find function name for this request ID
475+ function_name = next (
476+ (
477+ func
478+ for func , rid in function_request_map .items ()
479+ if rid == request_id
480+ ),
481+ "Unknown" ,
482+ )
483+ logger .info (
484+ f"Filtering logs with Lambda function: { function_name } , request_id: { request_id } "
485+ )
486+
619487 for group in groups_to_search :
620488 log_group_name = group ["name" ]
621489
@@ -776,10 +644,7 @@ def cloudwatch_document_logs(
776644 "document_status" : context .get ("document_status" ),
777645 "execution_arn" : execution_arn ,
778646 "search_strategy" : search_strategy ,
779- "cloudwatch_extraction_used" : cloudwatch_extraction_used ,
780- "extraction_method" : "cloudwatch_logs"
781- if cloudwatch_extraction_used
782- else "step_functions" ,
647+ "extraction_method" : "xray_trace" ,
783648 "failed_functions" : failed_functions ,
784649 "primary_failed_function" : primary_failed_function ,
785650 "processing_time_window" : {
0 commit comments