From 5059553851d9fe03417149b501cff739b552b522 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 27 Jan 2026 13:51:38 -0600 Subject: [PATCH 01/11] dmesg regex updates --- .../plugins/inband/dmesg/dmesg_analyzer.py | 119 ++++++++++++++++-- 1 file changed, 110 insertions(+), 9 deletions(-) diff --git a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py index dfed6715..881a319d 100644 --- a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +++ b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py @@ -157,8 +157,34 @@ class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]): event_category=EventCategory.SW_DRIVER, ), ErrorRegex( - regex=re.compile(r"(?:pcieport )(.*AER: aer_status.*)|(aer_status.*)"), - message="PCIe AER Error", + regex=re.compile( + r"(pcieport [\w:.]+: AER: aer_status:[^\n]*" + r"(?:\n[^\n]*){0,32}?" + r"pcieport [\w:.]+: AER: aer_layer=[^\n]*)", + re.MULTILINE, + ), + message="PCIe AER Error Status", + event_category=EventCategory.SW_DRIVER, + ), + ErrorRegex( + regex=re.compile(r"(.*aer_cor_status: 0x[0-9a-fA-F]+, aer_cor_mask: 0x[0-9a-fA-F]+.*)"), + message="PCIe AER Correctable Error Status", + event_category=EventCategory.SW_DRIVER, + ), + ErrorRegex( + regex=re.compile( + r"(.*aer_uncor_status: 0x[0-9a-fA-F]+, aer_uncor_mask: 0x[0-9a-fA-F]+.*)" + ), + message="PCIe AER Uncorrectable Error Status", + event_category=EventCategory.SW_DRIVER, + ), + ErrorRegex( + regex=re.compile( + r"(.*aer_uncor_severity: 0x[0-9a-fA-F]+.*)" + r"(\n.*TLP Header: (?:0x)?[0-9a-fA-F]+(?: (?:0x)?[0-9a-fA-F]+){3}.*)", + re.MULTILINE, + ), + message="PCIe AER Uncorrectable Error Severity with TLP Header", event_category=EventCategory.SW_DRIVER, ), ErrorRegex( @@ -332,12 +358,63 @@ class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]): event_category=EventCategory.BIOS, event_priority=EventPriority.WARNING, ), + ErrorRegex( + regex=re.compile(r"Failed to load MMP firmware qat_4xxx_mmp.bin"), + 
message="MMP Error", + event_category=EventCategory.BIOS, + event_priority=EventPriority.WARNING, + ), ErrorRegex( regex=re.compile(r"amdgpu \w{4}:\w{2}:\w{2}.\w: amdgpu: WARN: GPU is throttled.*"), message="GPU Throttled", event_category=EventCategory.SW_DRIVER, event_priority=EventPriority.WARNING, ), + ErrorRegex( + regex=re.compile( + r"amdgpu[ 0-9a-fA-F:.]+:(?:\s*amdgpu:)?\s+(?:{\d+})?poison is consumed by client \d+, kick off gpu reset flow" + ), + message="RAS Poison Consumed", + event_category=EventCategory.RAS, + ), + ErrorRegex( + regex=re.compile( + r"amdgpu[ 0-9a-fA-F:.]+:(?:\s*amdgpu:)?\s+(?:{\d+})?Poison is created" + ), + message="RAS Poison created", + event_category=EventCategory.RAS, + ), + ErrorRegex( + regex=re.compile(r"(amdgpu: Saved bad pages (\d+) reaches threshold value 128)"), + message="Bad page threshold exceeded", + event_category=EventCategory.RAS, + ), + ErrorRegex( + regex=re.compile( + r"Hardware error from APEI Generic Hardware Error Source:.*(?:\n.*){0,14}" + ), + message="RAS Hardware Error", + event_category=EventCategory.RAS, + ), + ErrorRegex( + regex=re.compile(r"Error Address.*(?:\s.*)"), + message="Error Address", + event_category=EventCategory.RAS, + ), + ErrorRegex( + regex=re.compile( + r"EDR: EDR event received", + ), + message="RAS EDR Event", + event_category=EventCategory.RAS, + ), + ErrorRegex( + regex=re.compile( + r"DPC: .*", + ), + message="DPC Event", + event_category=EventCategory.RAS, + ), ErrorRegex( regex=re.compile( r"(?:\[[^\]]+\]\s*)?LNetError:.*ko2iblnd:\s*No matching interfaces", @@ -402,34 +479,58 @@ def filter_dmesg( return filtered_dmesg - def _is_known_error(self, known_err_events: list[Event], unknown_match: str) -> bool: + def _is_known_error( + self, + known_err_events: list[Event], + unknown_match: str, + error_regex: Optional[list[ErrorRegex]] = None, + ) -> bool: """Check if a potential unknown error line has a known regex Args: known_err_events (list[Event]): list of events from known regex 
unknown_match (str): unknown match string + error_regex (Optional[list[ErrorRegex]]): list of error regexes to check against Returns: bool: return True if error is known """ - for regex_obj in self.ERROR_REGEX: + if error_regex is None: + error_regex = self.ERROR_REGEX + + # Normalize whitespace to reduce false negatives (collapse runs, trim) + def _norm(s: str) -> str: + return re.sub(r"\s+", " ", s.strip()) + + unknown_norm = _norm(unknown_match) + + # Direct regex hit counts as known + for regex_obj in error_regex: try: if regex_obj.regex.search(unknown_match): return True except re.error: + # If a bad pattern somehow slipped in, ignore it for unknown detection continue + # Compare against previously matched multi-line or single-line contents for event in known_err_events: known_match = event.data["match_content"] if isinstance(known_match, list): for line in known_match: - if unknown_match == line or unknown_match in line or line in unknown_match: + line_norm = _norm(line) + if ( + unknown_norm == line_norm + or unknown_norm in line_norm + or line_norm in unknown_norm + ): return True elif isinstance(known_match, str): + line_norm = _norm(known_match) if ( - unknown_match == known_match - or unknown_match in known_match - or known_match in unknown_match + unknown_norm == line_norm + or unknown_norm in line_norm + or line_norm in unknown_norm ): return True return False @@ -497,7 +598,7 @@ def analyze_data( for err_event in err_events: match_content = err_event.data["match_content"] - if not self._is_known_error(known_err_events, match_content): + if not self._is_known_error(known_err_events, match_content, self.ERROR_REGEX): self.result.events.append(err_event) return self.result From 846a04ec4f44e0c8e4d8dc79f28827a3ee1fd5b6 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 27 Jan 2026 17:31:47 -0600 Subject: [PATCH 02/11] enhanced pytests --- test/functional/fixtures/dmesg_sample.log | 28 ++++ test/unit/plugin/test_dmesg_analyzer.py | 189 
++++++++++++++++++++++ 2 files changed, 217 insertions(+) diff --git a/test/functional/fixtures/dmesg_sample.log b/test/functional/fixtures/dmesg_sample.log index 181a3ff8..e0844e42 100644 --- a/test/functional/fixtures/dmesg_sample.log +++ b/test/functional/fixtures/dmesg_sample.log @@ -41,3 +41,31 @@ kern :info : 2026-01-07T10:01:15,890123-06:00 NFSD: starting 90-second grace p kern :info : 2026-01-07T10:01:20,123456-06:00 Bluetooth: BNEP (Ethernet Emulation) ver 1.3 kern :info : 2026-01-07T10:01:25,234567-06:00 Bluetooth: BNEP filters: protocol multicast kern :info : 2026-01-07T10:01:30,345678-06:00 System operational - all services started successfully +kern :err : 2026-01-07T10:02:00,123456-06:00 amdgpu 0000:01:00.0: {14}poison is consumed by client 12, kick off gpu reset flow +kern :err : 2026-01-07T10:02:00,234567-06:00 amdgpu 0000:01:00.0: {15}Poison is created +kern :err : 2026-01-07T10:02:01,345678-06:00 amdgpu 0000:02:00.0: amdgpu: Saved bad pages 150 reaches threshold value 128 +kern :err : 2026-01-07T10:03:00,456789-06:00 {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1 +kern :err : 2026-01-07T10:03:00,567890-06:00 {1}[Hardware Error]: event severity: recoverable +kern :err : 2026-01-07T10:03:00,678901-06:00 {1}[Hardware Error]: Error 0, type: recoverable +kern :err : 2026-01-07T10:03:00,789012-06:00 {1}[Hardware Error]: section_type: PCIe error +kern :err : 2026-01-07T10:03:00,890123-06:00 {1}[Hardware Error]: port_type: 4, root port +kern :err : 2026-01-07T10:03:01,123456-06:00 {1}[Hardware Error]: version: 3.0 +kern :err : 2026-01-07T10:03:01,234567-06:00 {1}[Hardware Error]: command: 0x0547, status: 0x4010 +kern :err : 2026-01-07T10:03:01,345678-06:00 {1}[Hardware Error]: device_id: 0000:11:01.0 +kern :err : 2026-01-07T10:03:01,456789-06:00 {1}[Hardware Error]: slot: 5 +kern :err : 2026-01-07T10:03:01,567890-06:00 {1}[Hardware Error]: secondary_bus: 0x12 +kern :err : 2026-01-07T10:03:01,678901-06:00 {1}[Hardware 
Error]: vendor_id: 0x1002, device_id: 0x74a1 +kern :err : 2026-01-07T10:03:01,789012-06:00 {1}[Hardware Error]: class_code: 060400 +kern :err : 2026-01-07T10:03:01,890123-06:00 {1}[Hardware Error]: bridge: secondary_status: 0x2000, control: 0x0003 +kern :err : 2026-01-07T10:04:00,123456-06:00 amdgpu 0000:02:00.0: amdgpu: Error Address(PA):0x12345000 Row:0x1000 Col:0x0 Bank:0x5 Channel:0x10 +kern :err : 2026-01-07T10:05:00,123456-06:00 amdgpu 0000:03:00.0: {1}[Hardware Error]: Accelerator Check Architecture events logged +kern :err : 2026-01-07T10:05:00,234567-06:00 amdgpu 0000:03:00.0: {1}[Hardware Error]: ACA[01/01].CONTROL=0x00000000000003ff +kern :err : 2026-01-07T10:05:00,345678-06:00 amdgpu 0000:03:00.0: {1}[Hardware Error]: ACA[01/01].STATUS=0x98000800003e0000 +kern :err : 2026-01-07T10:05:00,456789-06:00 amdgpu 0000:03:00.0: {1}[Hardware Error]: ACA[01/01].ADDR=0x0000000000000000 +kern :err : 2026-01-07T10:05:00,567890-06:00 amdgpu 0000:03:00.0: {1}[Hardware Error]: ACA[01/01].MISC=0xd008000000000000 +kern :err : 2026-01-07T10:05:00,678901-06:00 amdgpu 0000:03:00.0: {1}[Hardware Error]: ACA[01/01].CONFIG=0x00000021000001fb +kern :err : 2026-01-07T10:05:00,789012-06:00 amdgpu 0000:03:00.0: {1}[Hardware Error]: ACA[01/01].IPID=0x0001100138430401 +kern :err : 2026-01-07T10:05:00,890123-06:00 amdgpu 0000:03:00.0: {1}[Hardware Error]: ACA[01/01].SYND=0x0000000000000000 +kern :err : 2026-01-07T10:05:01,123456-06:00 amdgpu 0000:03:00.0: {1}[Hardware Error]: ACA[01/01].DESTAT=0x0000000000000000 +kern :err : 2026-01-07T10:05:01,234567-06:00 amdgpu 0000:03:00.0: {1}[Hardware Error]: ACA[01/01].DEADDR=0x0000000000000000 +kern :err : 2026-01-07T10:05:01,345678-06:00 amdgpu 0000:03:00.0: {1}[Hardware Error]: ACA[01/01].CONTROL_MASK=0x0000000000000000 diff --git a/test/unit/plugin/test_dmesg_analyzer.py b/test/unit/plugin/test_dmesg_analyzer.py index 71a6d0b3..e542772e 100644 --- a/test/unit/plugin/test_dmesg_analyzer.py +++ b/test/unit/plugin/test_dmesg_analyzer.py @@ 
-24,6 +24,7 @@ # ############################################################################### import datetime +import pathlib from nodescraper.enums.eventpriority import EventPriority from nodescraper.enums.executionstatus import ExecutionStatus @@ -279,3 +280,191 @@ def test_aca(system_info): assert len(res.events) == 1 assert res.events[0].description == "ACA Error" assert res.events[0].priority == EventPriority.ERROR + + +def test_ras_poison_errors(system_info): + dmesg_data = DmesgData( + dmesg_content=( + "kern :err : 2024-11-24T17:53:23,028841-06:00 amdgpu 0000:68:00.0: {14}poison is consumed by client 12, kick off gpu reset flow\n" + "kern :err : 2024-11-24T17:53:23,028841-06:00 amdgpu 0000:68:00.0: {15}Poison is created\n" + "kern :info : 2024-11-24T17:53:24,028841-06:00 Normal log entry\n" + "kern :err : 2024-11-24T17:53:25,028841-06:00 amdgpu 0000:01:00.0: poison is consumed by client 5, kick off gpu reset flow\n" + "kern :err : 2024-11-24T17:53:26,028841-06:00 amdgpu 0000:02:00.0: amdgpu: Poison is created\n" + ) + ) + + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + dmesg_data, args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False) + ) + + assert res.status == ExecutionStatus.ERROR + assert len(res.events) == 4 + + poison_consumed_events = [e for e in res.events if e.description == "RAS Poison Consumed"] + poison_created_events = [e for e in res.events if e.description == "RAS Poison created"] + + assert len(poison_consumed_events) == 2 + assert len(poison_created_events) == 2 + + for event in res.events: + assert event.priority == EventPriority.ERROR + assert event.category == "RAS" + + +def test_bad_page_threshold(system_info): + dmesg_data = DmesgData( + dmesg_content=( + "kern :err : 2024-11-24T17:53:23,028841-06:00 amdgpu 0000:08:00.0: amdgpu: Saved bad pages 176 reaches threshold value 128\n" + "kern :info : 2024-11-24T17:53:24,028841-06:00 Normal log entry\n" + "kern :err : 2024-11-24T17:53:25,028841-06:00 
amdgpu: Saved bad pages 200 reaches threshold value 128\n" + ) + ) + + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + dmesg_data, args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False) + ) + + assert res.status == ExecutionStatus.ERROR + assert len(res.events) == 2 + + for event in res.events: + assert event.description == "Bad page threshold exceeded" + assert event.priority == EventPriority.ERROR + assert event.category == "RAS" + match_content = event.data["match_content"] + if isinstance(match_content, list): + assert "Saved bad pages" in match_content[0] + else: + assert "Saved bad pages" in match_content + + +def test_apei_hardware_error(system_info): + dmesg_data = DmesgData( + dmesg_content=( + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1\n" + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: event severity: recoverable\n" + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: Error 0, type: recoverable\n" + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: section_type: PCIe error\n" + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: port_type: 4, root port\n" + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: version: 3.0\n" + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: command: 0x0547, status: 0x4010\n" + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: device_id: 0000:54:01.0\n" + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: slot: 19\n" + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: secondary_bus: 0x55\n" + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: vendor_id: 0x8086, device_id: 0x352a\n" + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: class_code: 060400\n" + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: 
bridge: secondary_status: 0x2000, control: 0x0003\n" + "kern :info : 2024-09-21T06:12:54,000000-05:00 Normal log entry\n" + ) + ) + + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + dmesg_data, args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False) + ) + + assert res.status == ExecutionStatus.ERROR + assert len(res.events) == 1 + + event = res.events[0] + assert event.description == "RAS Hardware Error" + assert event.priority == EventPriority.ERROR + assert event.category == "RAS" + match_content = event.data["match_content"] + if isinstance(match_content, list): + assert any( + "Hardware error from APEI Generic Hardware Error Source" in line + for line in match_content + ) + else: + assert "Hardware error from APEI Generic Hardware Error Source" in match_content + + +def test_error_address_pa(system_info): + dmesg_data = DmesgData( + dmesg_content=( + "kern :err : 2024-11-24T17:53:23,028841-06:00 amdgpu 0000:08:00.0: amdgpu: Error Address(PA):0x60d1a4480 Row:0x1834 Col:0x0 Bank:0x7 Channel:0x74\n" + "kern :info : 2024-11-24T17:53:24,028841-06:00 Normal log entry\n" + "kern :err : 2024-11-24T17:53:25,028841-06:00 Error Address(PA):0x12345678 Row:0x100 Col:0x5 Bank:0x2 Channel:0x10\n" + ) + ) + + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + dmesg_data, args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False) + ) + + assert res.status == ExecutionStatus.ERROR + assert len(res.events) == 2 + + for event in res.events: + assert event.description == "Error Address" + assert event.priority == EventPriority.ERROR + assert event.category == "RAS" + match_content = event.data["match_content"] + if isinstance(match_content, list): + content_str = " ".join(match_content) + else: + content_str = match_content + assert "Error Address(PA)" in content_str + assert "Row:" in content_str + + +def test_fixture_file_ras_detection(system_info): + fixture_path = ( + pathlib.Path(__file__).parent.parent.parent / 
"functional" / "fixtures" / "dmesg_sample.log" + ) + with open(fixture_path, "r") as f: + fixture_content = f.read() + + dmesg_data = DmesgData(dmesg_content=fixture_content) + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + dmesg_data, args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False) + ) + + assert res.status == ExecutionStatus.ERROR + + descriptions = [e.description for e in res.events] + assert len(res.events) >= 6, f"Expected at least 6 errors, found {len(res.events)}" + assert "RAS Poison Consumed" in descriptions + assert "RAS Poison created" in descriptions + assert "Bad page threshold exceeded" in descriptions + assert "RAS Hardware Error" in descriptions + assert "Error Address" in descriptions + assert "ACA Error" in descriptions + + +def test_combined_ras_errors(system_info): + dmesg_data = DmesgData( + dmesg_content=( + "kern :err : 2024-11-24T17:53:23,028841-06:00 amdgpu 0000:68:00.0: {14}poison is consumed by client 12, kick off gpu reset flow\n" + "kern :err : 2024-11-24T17:53:23,028841-06:00 amdgpu 0000:68:00.0: {15}Poison is created\n" + "kern :err : 2024-11-24T17:53:23,028841-06:00 amdgpu 0000:08:00.0: amdgpu: Saved bad pages 176 reaches threshold value 128\n" + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1\n" + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: event severity: recoverable\n" + "kern :err : 2024-09-21T06:12:53,907220-05:00 {1}[Hardware Error]: Error 0, type: recoverable\n" + "kern :err : 2024-11-24T17:53:23,028841-06:00 amdgpu 0000:08:00.0: amdgpu: Error Address(PA):0x60d1a4480 Row:0x1834 Col:0x0 Bank:0x7 Channel:0x74\n" + ) + ) + + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + dmesg_data, args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False) + ) + + assert res.status == ExecutionStatus.ERROR + assert len(res.events) == 5 + + descriptions = [e.description 
for e in res.events] + assert "RAS Poison Consumed" in descriptions + assert "RAS Poison created" in descriptions + assert "Bad page threshold exceeded" in descriptions + assert "RAS Hardware Error" in descriptions + assert "Error Address" in descriptions + + for event in res.events: + assert event.category == "RAS" + assert event.priority == EventPriority.ERROR From 9ade6f209e9e9aaed25545cb9d22e442a126fd10 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 27 Jan 2026 19:02:14 -0600 Subject: [PATCH 03/11] enhanced with timestamp parsing and collapsing --- nodescraper/base/regexanalyzer.py | 129 ++++++++++++++++-- .../plugins/inband/dmesg/analyzer_args.py | 2 + .../plugins/inband/dmesg/dmesg_analyzer.py | 8 +- 3 files changed, 130 insertions(+), 9 deletions(-) diff --git a/nodescraper/base/regexanalyzer.py b/nodescraper/base/regexanalyzer.py index 17f55f5a..abb90d2e 100644 --- a/nodescraper/base/regexanalyzer.py +++ b/nodescraper/base/regexanalyzer.py @@ -23,8 +23,9 @@ # SOFTWARE. # ############################################################################### +import datetime import re -from typing import Union +from typing import Optional, Union from pydantic import BaseModel @@ -54,6 +55,33 @@ def count(self, val: int): class RegexAnalyzer(DataAnalyzer[TDataModel, TAnalyzeArg]): """Parent class for all regex based data analyzers.""" + # Class variable for timestamp pattern - can be overridden in subclasses + TIMESTAMP_PATTERN: re.Pattern = re.compile(r"(\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)") + + def _extract_timestamp_from_match_position( + self, content: str, match_start: int + ) -> Optional[str]: + """Extract timestamp from the line where a regex match starts. 
+ + Args: + content (str): Full content being analyzed + match_start (int): Start position of the regex match + + Returns: + Optional[str]: Extracted timestamp string or None + """ + # Get the line where the match starts + line_start = content.rfind("\n", 0, match_start) + 1 + line_end = content.find("\n", match_start) + if line_end == -1: + line_end = len(content) + + first_line = content[line_start:line_end] + + # Extract timestamp from first line only using class pattern + timestamp_match = self.TIMESTAMP_PATTERN.search(first_line) + return timestamp_match.group(1) if timestamp_match else None + def _build_regex_event( self, regex_obj: ErrorRegex, match: Union[str, list[str]], source: str ) -> RegexEvent: @@ -82,15 +110,28 @@ def _build_regex_event( ) def check_all_regexes( - self, content: str, source: str, error_regex: list[ErrorRegex], group=True + self, + content: str, + source: str, + error_regex: list[ErrorRegex], + group: bool = True, + num_timestamps: int = 3, + interval_to_collapse_event: int = 60, ) -> list[RegexEvent]: """Iterate over all ERROR_REGEX and check content for any matches + Enhanced with timestamp-based event collapsing: + - Extracts timestamps from matched lines + - Collapses events within interval_to_collapse_event seconds + - Prunes timestamp lists to keep first N and last N timestamps + Args: content (str): content to match regex on source (str): descriptor for content error_regex (list[ErrorRegex]): list of regex objects to match group (bool, optional): flag to control whether matches should be grouped together. Defaults to True. + num_timestamps (int, optional): maximum number of timestamps to keep for each event. Defaults to 3. + interval_to_collapse_event (int, optional): time interval in seconds to collapse events. Defaults to 60. 
Returns: list[RegexEvent]: list of regex event objects @@ -99,8 +140,39 @@ def check_all_regexes( regex_map: dict[str, RegexEvent] = {} regex_event_list: list[RegexEvent] = [] + def _is_within_interval(new_timestamp_str: str, existing_timestamps: list[str]) -> bool: + """Check if new timestamp is within the specified interval of any existing timestamp""" + try: + new_dt = datetime.datetime.fromisoformat(new_timestamp_str.replace(",", ".")) + except Exception as e: + self.logger.warning( + f"WARNING: Failed to parse date from timestamp: {new_timestamp_str}. Error: {e}" + ) + return False + + if not new_dt: + return False + + for existing_ts in existing_timestamps: + try: + existing_dt = datetime.datetime.fromisoformat(existing_ts.replace(",", ".")) + if ( + existing_dt + and abs((new_dt - existing_dt).total_seconds()) < interval_to_collapse_event + ): + return True + except Exception: + continue + return False + for error_regex_obj in error_regex: - for match in error_regex_obj.regex.findall(content): + for match_obj in error_regex_obj.regex.finditer(content): + # Extract timestamp from the line where match occurs + timestamp = self._extract_timestamp_from_match_position(content, match_obj.start()) + + match = match_obj.groups() if match_obj.groups() else match_obj.group(0) + + # Process multi-line matches if isinstance(match, str) and "\n" in match: match = match.strip().split("\n") @@ -110,11 +182,52 @@ def check_all_regexes( if len(match) == 1: match = match[0] - if group and str(match) in regex_map: - regex_map[str(match)].count += 1 + # Create match key for grouping + match_key = str(match) + + if group and match_key in regex_map: + # Increment count for existing match + existing_event = regex_map[match_key] + existing_event.count += 1 + + # Add timestamp to timestamps list if we have one + if timestamp: + timestamps_list = existing_event.data.get("timestamps", []) + # Check if new timestamp is within the specified interval of existing ones + if not 
_is_within_interval(timestamp, timestamps_list): + timestamps_list.append(timestamp) + existing_event.data["timestamps"] = timestamps_list + elif group: - regex_map[str(match)] = self._build_regex_event(error_regex_obj, match, source) + # Create new grouped event + new_event = self._build_regex_event(error_regex_obj, match, source) + + # Add timestamp information + if timestamp: + new_event.data["timestamps"] = [timestamp] + + regex_map[match_key] = new_event + else: - regex_event_list.append(self._build_regex_event(error_regex_obj, match, source)) + # Create individual event (no grouping) + new_event = self._build_regex_event(error_regex_obj, match, source) + + # Add single timestamp + if timestamp: + new_event.data["timestamp"] = timestamp + + regex_event_list.append(new_event) + + all_events = list(regex_map.values()) if group else regex_event_list + + # Prune timestamp lists to keep only first N and last N timestamps + for event in all_events: + timestamps_list = event.data.get("timestamps", []) + if isinstance(timestamps_list, list) and len(timestamps_list) > 2 * num_timestamps: + # Keep first num_timestamps and last num_timestamps + pruned_timestamps = ( + timestamps_list[:num_timestamps] + timestamps_list[-num_timestamps:] + ) + event.data["timestamps"] = pruned_timestamps - return list(regex_map.values()) if group else regex_event_list + return all_events diff --git a/nodescraper/plugins/inband/dmesg/analyzer_args.py b/nodescraper/plugins/inband/dmesg/analyzer_args.py index 62bd7bd3..d1e02156 100644 --- a/nodescraper/plugins/inband/dmesg/analyzer_args.py +++ b/nodescraper/plugins/inband/dmesg/analyzer_args.py @@ -31,3 +31,5 @@ class DmesgAnalyzerArgs(TimeRangeAnalysisArgs): check_unknown_dmesg_errors: Optional[bool] = True exclude_category: Optional[set[str]] = None + interval_to_collapse_event: int = 60 + num_timestamps: int = 3 diff --git a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py index 
881a319d..f2fdf6bd 100644 --- a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +++ b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py @@ -571,7 +571,11 @@ def analyze_data( dmesg_content = data.dmesg_content known_err_events = self.check_all_regexes( - content=dmesg_content, source="dmesg", error_regex=self.ERROR_REGEX + content=dmesg_content, + source="dmesg", + error_regex=self.ERROR_REGEX, + num_timestamps=args.num_timestamps, + interval_to_collapse_event=args.interval_to_collapse_event, ) if args.exclude_category: known_err_events = [ @@ -594,6 +598,8 @@ def analyze_data( event_priority=EventPriority.WARNING, ) ], + num_timestamps=args.num_timestamps, + interval_to_collapse_event=args.interval_to_collapse_event, ) for err_event in err_events: From e39ca71e1fb0278affbb9d0dbbf9bc4933ce8350 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 28 Jan 2026 12:07:26 -0600 Subject: [PATCH 04/11] extra regex functionality --- nodescraper/base/regexanalyzer.py | 40 +++ .../plugins/inband/dmesg/dmesg_analyzer.py | 8 +- test/unit/framework/test_regexanalyzer.py | 236 ++++++++++++++++++ 3 files changed, 282 insertions(+), 2 deletions(-) create mode 100644 test/unit/framework/test_regexanalyzer.py diff --git a/nodescraper/base/regexanalyzer.py b/nodescraper/base/regexanalyzer.py index abb90d2e..c1f86bfd 100644 --- a/nodescraper/base/regexanalyzer.py +++ b/nodescraper/base/regexanalyzer.py @@ -82,6 +82,46 @@ def _extract_timestamp_from_match_position( timestamp_match = self.TIMESTAMP_PATTERN.search(first_line) return timestamp_match.group(1) if timestamp_match else None + def _convert_and_extend_error_regex( + self, custom_regex: Optional[list[ErrorRegex] | list[dict]], base_regex: list[ErrorRegex] + ) -> list[ErrorRegex]: + """Convert custom error patterns and extend base ERROR_REGEX. 
+ + Supports two input formats: + - ErrorRegex objects directly + - Dicts with regex/message/category/priority that get converted to ErrorRegex + + Args: + custom_regex: Optional list of custom error patterns (ErrorRegex objects or dicts) + base_regex: Base list of ErrorRegex patterns to extend + + Returns: + Extended list of ErrorRegex objects (custom patterns + base patterns) + + Example: + custom = [ + {"regex": r"my-error.*", "message": "Custom error", "event_category": "SW_DRIVER"} + ] + extended = analyzer._convert_and_extend_error_regex(custom, analyzer.ERROR_REGEX) + """ + if not custom_regex or not isinstance(custom_regex, list): + return list(base_regex) + + converted_regex = [] + for item in custom_regex: + if isinstance(item, ErrorRegex): + converted_regex.append(item) + elif isinstance(item, dict): + # Convert dict to ErrorRegex + item["regex"] = re.compile(item["regex"]) + if "event_category" in item: + item["event_category"] = EventCategory(item["event_category"]) + if "event_priority" in item: + item["event_priority"] = EventPriority(item["event_priority"]) + converted_regex.append(ErrorRegex(**item)) + + return converted_regex + list(base_regex) + def _build_regex_event( self, regex_obj: ErrorRegex, match: Union[str, list[str]], source: str ) -> RegexEvent: diff --git a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py index f2fdf6bd..c0e3702c 100644 --- a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +++ b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py @@ -539,12 +539,14 @@ def analyze_data( self, data: DmesgData, args: Optional[DmesgAnalyzerArgs] = None, + error_regex: Optional[list[ErrorRegex] | list[dict]] = None, ) -> TaskResult: """Analyze dmesg data for errors Args: data (DmesgData): dmesg data to analyze args (Optional[DmesgAnalyzerArgs], optional): dmesg analysis arguments. Defaults to None. 
+ error_regex (Optional[list[ErrorRegex] | list[dict]]): custom error regexes to extend ERROR_REGEX Returns: TaskResult: The result of the analysis containing status and message. @@ -553,6 +555,8 @@ def analyze_data( if not args: args = DmesgAnalyzerArgs() + final_error_regex = self._convert_and_extend_error_regex(error_regex, self.ERROR_REGEX) + if args.analysis_range_start or args.analysis_range_end: self.logger.info( "Filtering dmesg using range %s - %s", @@ -573,7 +577,7 @@ def analyze_data( known_err_events = self.check_all_regexes( content=dmesg_content, source="dmesg", - error_regex=self.ERROR_REGEX, + error_regex=final_error_regex, num_timestamps=args.num_timestamps, interval_to_collapse_event=args.interval_to_collapse_event, ) @@ -604,7 +608,7 @@ def analyze_data( for err_event in err_events: match_content = err_event.data["match_content"] - if not self._is_known_error(known_err_events, match_content, self.ERROR_REGEX): + if not self._is_known_error(known_err_events, match_content, final_error_regex): self.result.events.append(err_event) return self.result diff --git a/test/unit/framework/test_regexanalyzer.py b/test/unit/framework/test_regexanalyzer.py new file mode 100644 index 00000000..dc8c5576 --- /dev/null +++ b/test/unit/framework/test_regexanalyzer.py @@ -0,0 +1,236 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+#
+###############################################################################
+import re
+
+from pydantic import BaseModel
+
+from nodescraper.base.regexanalyzer import ErrorRegex, RegexAnalyzer
+from nodescraper.enums import EventCategory, EventPriority
+from nodescraper.models.datamodel import DataModel
+
+
+class DummyData(DataModel):
+    """Empty data model; exists only to parametrize the analyzer under test."""
+
+
+class DummyArgs(BaseModel):
+    """Empty argument model; exists only to parametrize the analyzer under test."""
+
+
+class TestRegexAnalyzer(RegexAnalyzer[DummyData, DummyArgs]):
+    __test__ = False  # helper class, not a test case: keep pytest from collecting it
+    DATA_MODEL = DummyData
+    ERROR_REGEX = [
+        ErrorRegex(
+            regex=re.compile(r"base error 1"),
+            message="Base Error 1",
+            event_category=EventCategory.SW_DRIVER,
+        ),
+        ErrorRegex(
+            regex=re.compile(r"base error 2"),
+            message="Base Error 2",
+            event_category=EventCategory.OS,
+            event_priority=EventPriority.WARNING,
+        ),
+    ]
+
+    def analyze_data(self, data, args=None):
+        """No-op; the tests below call _convert_and_extend_error_regex directly."""
+
+
+def test_convert_and_extend_with_none(system_info):
+    """Passing None as the custom list yields the two base patterns, in order."""
+    analyzer = TestRegexAnalyzer(system_info=system_info)
+    base_regex = analyzer.ERROR_REGEX
+    result = analyzer._convert_and_extend_error_regex(None, base_regex)
+
+    assert len(result) == 2
+    assert result[0].message == "Base Error 1"
+    assert result[1].message == "Base Error 2"
+
+
+def test_convert_and_extend_with_empty_list(system_info):
+    """Passing an empty custom list yields the two base patterns, in order."""
+    analyzer = TestRegexAnalyzer(system_info=system_info)
+    base_regex = analyzer.ERROR_REGEX
+    result = analyzer._convert_and_extend_error_regex([], base_regex)
+
+    assert len(result) == 2
+    assert result[0].message == "Base Error 1"
+    assert result[1].message == "Base Error 2"
+
+
+def test_convert_and_extend_with_error_regex_objects(system_info):
+    """Custom ErrorRegex objects come first in the result, followed by the base patterns."""
+    analyzer = TestRegexAnalyzer(system_info=system_info)
+    base_regex = analyzer.ERROR_REGEX
+    custom_regex = [
+        ErrorRegex(
+            regex=re.compile(r"custom error 1"),
+            message="Custom Error 1",
+            event_category=EventCategory.RAS,
+        ),
+        ErrorRegex(
+            regex=re.compile(r"custom error 2"),
+            message="Custom Error 2",
+            event_category=EventCategory.BIOS,
+            event_priority=EventPriority.CRITICAL,
+        ),
+    ]
+
+    result = analyzer._convert_and_extend_error_regex(custom_regex, base_regex)
+
+    assert len(result) == 4
+    assert result[0].message == "Custom Error 1"
+    assert result[0].event_category == EventCategory.RAS
+    assert result[0].event_priority == EventPriority.ERROR
+    assert result[1].message == "Custom Error 2"
+    assert result[1].event_category == EventCategory.BIOS
+    assert result[1].event_priority == EventPriority.CRITICAL
+    assert result[2].message == "Base Error 1"
+    assert result[3].message == "Base Error 2"
+
+
+def test_convert_and_extend_with_dict_format(system_info):
+    """Dict entries come back as objects with a compiled pattern and enum-valued fields."""
+    analyzer = TestRegexAnalyzer(system_info=system_info)
+    base_regex = analyzer.ERROR_REGEX
+    custom_regex = [
+        {
+            "regex": r"custom dict error 1",
+            "message": "Custom Dict Error 1",
+            "event_category": "RAS",
+        },
+        {
+            "regex": r"custom dict error 2",
+            "message": "Custom Dict Error 2",
+            "event_category": "IO",
+            "event_priority": 2,
+        },
+    ]
+
+    result = analyzer._convert_and_extend_error_regex(custom_regex, base_regex)
+
+    assert len(result) == 4
+    assert result[0].message == "Custom Dict Error 1"
+    assert result[0].event_category == EventCategory.RAS
+    assert result[0].event_priority == EventPriority.ERROR
+    assert isinstance(result[0].regex, re.Pattern)
+    assert result[1].message == "Custom Dict Error 2"
+    assert result[1].event_category == EventCategory.IO
+    assert result[1].event_priority == EventPriority.WARNING
+    assert isinstance(result[1].regex, re.Pattern)
+    assert result[2].message == "Base Error 1"
+    assert result[3].message == "Base Error 2"
+
+
+def test_convert_and_extend_with_mixed_formats(system_info):
+    """ErrorRegex objects and dicts may be mixed in a single custom list."""
+    analyzer = TestRegexAnalyzer(system_info=system_info)
+    base_regex = analyzer.ERROR_REGEX
+    custom_regex = [
+        ErrorRegex(
+            regex=re.compile(r"error regex object"),
+            message="Error Regex Object",
+            event_category=EventCategory.NETWORK,
+        ),
+        {
+            "regex": r"error dict object",
+            "message": "Error Dict Object",
+            "event_category": "SW_DRIVER",
+            "event_priority": 4,
+        },
+    ]
+
+    result = analyzer._convert_and_extend_error_regex(custom_regex, base_regex)
+
+    assert len(result) == 4
+    assert result[0].message == "Error Regex Object"
+    assert result[0].event_category == EventCategory.NETWORK
+    assert result[1].message == "Error Dict Object"
+    assert result[1].event_category == EventCategory.SW_DRIVER
+    assert result[1].event_priority == EventPriority.CRITICAL
+    assert result[2].message == "Base Error 1"
+    assert result[3].message == "Base Error 2"
+
+
+def test_convert_and_extend_dict_without_optional_fields(system_info):
+    """A dict with only regex and message gets category UNKNOWN and priority ERROR."""
+    analyzer = TestRegexAnalyzer(system_info=system_info)
+    base_regex = analyzer.ERROR_REGEX
+    custom_regex = [
+        {
+            "regex": r"minimal error",
+            "message": "Minimal Error",
+        }
+    ]
+
+    result = analyzer._convert_and_extend_error_regex(custom_regex, base_regex)
+
+    assert len(result) == 3
+    assert result[0].message == "Minimal Error"
+    assert result[0].event_category == EventCategory.UNKNOWN
+    assert result[0].event_priority == EventPriority.ERROR
+
+
+def test_convert_and_extend_regex_patterns_work(system_info):
+    """A pattern supplied as a string in a dict is usable for searching after conversion."""
+    analyzer = TestRegexAnalyzer(system_info=system_info)
+    base_regex = analyzer.ERROR_REGEX
+    custom_regex = [
+        {
+            "regex": r"test\s+error\s+\d+",
+            "message": "Test Error Pattern",
+            "event_category": "SW_DRIVER",
+        }
+    ]
+
+    result = analyzer._convert_and_extend_error_regex(custom_regex, base_regex)
+
+    assert len(result) == 3
+    test_string_match = "test error 123"
+    test_string_no_match = "test error abc"
+
+    assert result[0].regex.search(test_string_match) is not None
+    assert result[0].regex.search(test_string_no_match) is None
+
+
+def test_convert_and_extend_preserves_base_regex(system_info):
+    """Extending with custom patterns must not change the length of the base list."""
+    analyzer = TestRegexAnalyzer(system_info=system_info)
+    base_regex = analyzer.ERROR_REGEX
+    original_base_length = len(base_regex)
+    custom_regex = [
+        {
+            "regex": r"custom error",
+            "message": "Custom Error",
+        }
+    ]
+
+    result = analyzer._convert_and_extend_error_regex(custom_regex, base_regex)
+
+    assert len(base_regex) == original_base_length
+    assert len(result) == original_base_length + 1

From 7036e92d72acad669ec07e3a187dcc479a5906ad Mon Sep 17 00:00:00 2001
From: Alexandra Bara
Date: Wed, 28 Jan 2026 13:34:07 -0600
Subject: [PATCH 05/11] py3.9 syntax

---
 nodescraper/base/regexanalyzer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nodescraper/base/regexanalyzer.py b/nodescraper/base/regexanalyzer.py
index c1f86bfd..603d6c45 100644
--- a/nodescraper/base/regexanalyzer.py
+++ b/nodescraper/base/regexanalyzer.py
@@ -83,7 +83,9 @@ def _extract_timestamp_from_match_position(
         return timestamp_match.group(1) if timestamp_match else None
 
     def _convert_and_extend_error_regex(
-        self, custom_regex: Optional[list[ErrorRegex] | list[dict]], base_regex: list[ErrorRegex]
+        self,
+        custom_regex: Optional[Union[list[ErrorRegex], list[dict]]],
+        base_regex: list[ErrorRegex],
     ) -> list[ErrorRegex]:
         """Convert custom error patterns and extend base ERROR_REGEX.
 
From d74234d597f6d1aeb1840c6961aad4e5cc9d15cf Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 28 Jan 2026 15:56:14 -0600 Subject: [PATCH 06/11] py3.9 --- nodescraper/plugins/inband/dmesg/dmesg_analyzer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py index c0e3702c..cd7e0674 100644 --- a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +++ b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py @@ -25,7 +25,7 @@ ############################################################################### import datetime import re -from typing import Optional +from typing import Optional, Union from nodescraper.base.regexanalyzer import ErrorRegex, RegexAnalyzer from nodescraper.connection.inband import TextFileArtifact @@ -539,14 +539,14 @@ def analyze_data( self, data: DmesgData, args: Optional[DmesgAnalyzerArgs] = None, - error_regex: Optional[list[ErrorRegex] | list[dict]] = None, + error_regex: Optional[Union[list[ErrorRegex], list[dict]]] = None, ) -> TaskResult: """Analyze dmesg data for errors Args: data (DmesgData): dmesg data to analyze args (Optional[DmesgAnalyzerArgs], optional): dmesg analysis arguments. Defaults to None. - error_regex (Optional[list[ErrorRegex] | list[dict]]): custom error regexes to extend ERROR_REGEX + error_regex (Optional[Union[list[ErrorRegex], list[dict]]]): custom error regexes to extend ERROR_REGEX Returns: TaskResult: The result of the analysis containing status and message. 
From 4950567a66bb788b8d2494d74f915a91a0a07f39 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 29 Jan 2026 11:37:54 -0600 Subject: [PATCH 07/11] utests for time collapsing + custom regex in configs --- .../fixtures/dmesg_plugin_config.json | 6 +- .../dmesg_plugin_config_custom_regex.json | 34 +++ .../fixtures/dmesg_sample_with_collapsing.log | 30 ++ test/functional/test_plugin_configs.py | 258 +++++++++++++++++ test/unit/plugin/test_dmesg_analyzer.py | 271 ++++++++++++++++++ 5 files changed, 597 insertions(+), 2 deletions(-) create mode 100644 test/functional/fixtures/dmesg_plugin_config_custom_regex.json create mode 100644 test/functional/fixtures/dmesg_sample_with_collapsing.log diff --git a/test/functional/fixtures/dmesg_plugin_config.json b/test/functional/fixtures/dmesg_plugin_config.json index 6b40439a..4424e741 100644 --- a/test/functional/fixtures/dmesg_plugin_config.json +++ b/test/functional/fixtures/dmesg_plugin_config.json @@ -4,11 +4,13 @@ "DmesgPlugin": { "analysis_args": { "check_unknown_dmesg_errors": true, - "exclude_category": null + "exclude_category": null, + "interval_to_collapse_event": 60, + "num_timestamps": 3 } } }, "result_collators": {}, "name": "DmesgPlugin config", - "desc": "Config for testing DmesgPlugin" + "desc": "Config for testing DmesgPlugin with event collapsing" } diff --git a/test/functional/fixtures/dmesg_plugin_config_custom_regex.json b/test/functional/fixtures/dmesg_plugin_config_custom_regex.json new file mode 100644 index 00000000..a6845869 --- /dev/null +++ b/test/functional/fixtures/dmesg_plugin_config_custom_regex.json @@ -0,0 +1,34 @@ +{ + "global_args": {}, + "plugins": { + "DmesgPlugin": { + "analysis_args": { + "check_unknown_dmesg_errors": false, + "interval_to_collapse_event": 60, + "num_timestamps": 3, + "error_regex": [ + { + "regex": "CUSTOM_ERROR_TYPE_1.*", + "message": "Custom Error Type 1", + "event_category": "SW_DRIVER", + "event_priority": 4 + }, + { + "regex": "CUSTOM_ERROR_TYPE_2.*", + 
"message": "Custom Error Type 2", + "event_category": "RAS", + "event_priority": 3 + }, + { + "regex": "APPLICATION_CRASH: .*", + "message": "Application Crash Detected", + "event_category": "SW_DRIVER" + } + ] + } + } + }, + "result_collators": {}, + "name": "DmesgPlugin config with custom regex", + "desc": "Config for testing DmesgPlugin with custom error regex and event collapsing" +} diff --git a/test/functional/fixtures/dmesg_sample_with_collapsing.log b/test/functional/fixtures/dmesg_sample_with_collapsing.log new file mode 100644 index 00000000..418db504 --- /dev/null +++ b/test/functional/fixtures/dmesg_sample_with_collapsing.log @@ -0,0 +1,30 @@ +kern :info : 2026-01-07T10:00:00,123456-06:00 Linux version 5.15.0-91-generic (buildd@amd64-builder) (gcc version 11.4.0) #101-Ubuntu SMP +kern :info : 2026-01-07T10:00:01,234567-06:00 Command line: BOOT_IMAGE=/boot/vmlinuz-5.15.0-91-generic root=UUID=a1b2c3d4 ro quiet splash +kern :info : 2026-01-07T10:00:02,345678-06:00 Memory: 32823616K/33554432K available +kern :err : 2026-01-07T10:00:10,000000-06:00 CUSTOM_ERROR_TYPE_1: Device initialization failed on port 0x1234 +kern :err : 2026-01-07T10:00:20,000000-06:00 CUSTOM_ERROR_TYPE_1: Device initialization failed on port 0x1235 +kern :err : 2026-01-07T10:00:30,000000-06:00 CUSTOM_ERROR_TYPE_1: Device initialization failed on port 0x1236 +kern :err : 2026-01-07T10:00:40,000000-06:00 oom_kill_process: killed process 1234 +kern :err : 2026-01-07T10:00:50,000000-06:00 oom_kill_process: killed process 1235 +kern :info : 2026-01-07T10:01:00,000000-06:00 System attempting recovery +kern :err : 2026-01-07T10:01:10,000000-06:00 CUSTOM_ERROR_TYPE_2: Hardware sensor malfunction detected +kern :err : 2026-01-07T10:02:00,000000-06:00 oom_kill_process: killed process 1236 +kern :err : 2026-01-07T10:02:10,000000-06:00 CUSTOM_ERROR_TYPE_1: Device initialization failed on port 0x1237 +kern :err : 2026-01-07T10:02:20,000000-06:00 IO_PAGE_FAULT: address=0xfffffffffffffef0 +kern :err 
: 2026-01-07T10:02:30,000000-06:00 IO_PAGE_FAULT: address=0xfffffffffffffef1 +kern :err : 2026-01-07T10:02:40,000000-06:00 IO_PAGE_FAULT: address=0xfffffffffffffef2 +kern :info : 2026-01-07T10:03:00,000000-06:00 Network interface eth0 up +kern :err : 2026-01-07T10:03:10,000000-06:00 APPLICATION_CRASH: Process nginx (PID 5678) terminated unexpectedly +kern :err : 2026-01-07T10:04:00,000000-06:00 oom_kill_process: killed process 1237 +kern :err : 2026-01-07T10:04:10,000000-06:00 CUSTOM_ERROR_TYPE_2: Hardware sensor malfunction detected +kern :err : 2026-01-07T10:05:00,000000-06:00 IO_PAGE_FAULT: address=0xfffffffffffffef3 +kern :err : 2026-01-07T10:06:00,000000-06:00 oom_kill_process: killed process 1238 +kern :err : 2026-01-07T10:06:10,000000-06:00 APPLICATION_CRASH: Process apache2 (PID 9012) terminated unexpectedly +kern :err : 2026-01-07T10:07:00,000000-06:00 CUSTOM_ERROR_TYPE_1: Device initialization failed on port 0x1238 +kern :err : 2026-01-07T10:08:00,000000-06:00 oom_kill_process: killed process 1239 +kern :err : 2026-01-07T10:08:10,000000-06:00 IO_PAGE_FAULT: address=0xfffffffffffffef4 +kern :info : 2026-01-07T10:09:00,000000-06:00 System stabilized +kern :err : 2026-01-07T10:10:00,000000-06:00 oom_kill_process: killed process 1240 +kern :err : 2026-01-07T10:10:10,000000-06:00 CUSTOM_ERROR_TYPE_2: Hardware sensor malfunction detected +kern :err : 2026-01-07T10:10:20,000000-06:00 APPLICATION_CRASH: Process mysql (PID 3456) terminated unexpectedly +kern :info : 2026-01-07T10:11:00,000000-06:00 All services operational diff --git a/test/functional/test_plugin_configs.py b/test/functional/test_plugin_configs.py index d42382ce..7f4ea6ce 100644 --- a/test/functional/test_plugin_configs.py +++ b/test/functional/test_plugin_configs.py @@ -334,3 +334,261 @@ def test_dmesg_plugin_log_dmesg_data_true(run_cli_command, tmp_path): if dmesg_plugin_dir.exists(): dmesg_log_files = list(dmesg_plugin_dir.glob("dmesg*.log")) assert len(dmesg_log_files) > 0, "Expected dmesg.log 
file when log_dmesg_data=True" + + +def test_dmesg_plugin_with_custom_regex_in_config(run_cli_command, tmp_path): + """Test DmesgPlugin with custom error regex passed through plugin config""" + # Create a test dmesg fixture with custom errors + test_dmesg_content = """kern :err : 2026-01-07T10:00:00,000000-06:00 CUSTOM_APP_CRASH: Application XYZ crashed +kern :err : 2026-01-07T10:00:05,000000-06:00 CUSTOM_DRIVER_TIMEOUT: Driver timeout occurred +kern :err : 2026-01-07T10:00:10,000000-06:00 oom_kill_process +kern :err : 2026-01-07T10:00:15,000000-06:00 CUSTOM_APP_CRASH: Application ABC crashed +""" + + dmesg_file = tmp_path / "custom_dmesg.log" + dmesg_file.write_text(test_dmesg_content) + + # Create config with custom regex + config = { + "name": "DmesgCustomRegexConfig", + "desc": "DmesgPlugin config with custom error regex", + "global_args": {}, + "plugins": { + "DmesgPlugin": { + "collection_args": {"dmesg_file": str(dmesg_file)}, + "analysis_args": { + "check_unknown_dmesg_errors": False, + "error_regex": [ + { + "regex": r"CUSTOM_APP_CRASH.*", + "message": "Custom Application Crash", + "event_category": "SW_DRIVER", + "event_priority": 4, + }, + { + "regex": r"CUSTOM_DRIVER_TIMEOUT.*", + "message": "Custom Driver Timeout", + "event_category": "SW_DRIVER", + "event_priority": 3, + }, + ], + }, + } + }, + "result_collators": {}, + } + + config_file = tmp_path / "custom_regex_config.json" + config_file.write_text(json.dumps(config, indent=2)) + + log_path = str(tmp_path / "logs_custom_regex") + result = run_cli_command( + ["--log-path", log_path, "--plugin-configs", str(config_file)], check=False + ) + + # Check that command ran successfully + assert result.returncode in [0, 1, 2] + + # Verify results JSON contains custom errors + results_dir = Path(log_path) / "dmesg_plugin" / "dmesg_analyzer" + if results_dir.exists(): + result_files = list(results_dir.glob("results*.json")) + if result_files: + with open(result_files[0], "r") as f: + results = json.load(f) + 
events = results.get("events", []) + + # Should detect custom errors and base errors + descriptions = [e["description"] for e in events] + assert "Custom Application Crash" in descriptions or len(events) > 0 + assert "Out of memory error" in descriptions or len(events) > 0 + + +def test_dmesg_plugin_with_event_collapsing_config(run_cli_command, tmp_path): + """Test DmesgPlugin with event collapsing parameters in config""" + # Create a test dmesg with repeated errors at different intervals + test_dmesg_content = """kern :err : 2026-01-07T10:00:00,000000-06:00 oom_kill_process +kern :err : 2026-01-07T10:00:30,000000-06:00 oom_kill_process +kern :err : 2026-01-07T10:00:50,000000-06:00 oom_kill_process +kern :err : 2026-01-07T10:02:00,000000-06:00 oom_kill_process +kern :err : 2026-01-07T10:04:00,000000-06:00 oom_kill_process +kern :err : 2026-01-07T10:06:00,000000-06:00 oom_kill_process +kern :err : 2026-01-07T10:08:00,000000-06:00 oom_kill_process +kern :err : 2026-01-07T10:10:00,000000-06:00 oom_kill_process +""" + + dmesg_file = tmp_path / "collapse_dmesg.log" + dmesg_file.write_text(test_dmesg_content) + + # Create config with event collapsing parameters + config = { + "name": "DmesgEventCollapsingConfig", + "desc": "DmesgPlugin config with event collapsing", + "global_args": {}, + "plugins": { + "DmesgPlugin": { + "collection_args": {"dmesg_file": str(dmesg_file)}, + "analysis_args": { + "check_unknown_dmesg_errors": False, + "interval_to_collapse_event": 60, + "num_timestamps": 2, + }, + } + }, + "result_collators": {}, + } + + config_file = tmp_path / "collapse_config.json" + config_file.write_text(json.dumps(config, indent=2)) + + log_path = str(tmp_path / "logs_collapse") + result = run_cli_command( + ["--log-path", log_path, "--plugin-configs", str(config_file)], check=False + ) + + assert result.returncode in [0, 1, 2] + + # Verify results JSON shows collapsed events + results_dir = Path(log_path) / "dmesg_plugin" / "dmesg_analyzer" + if 
results_dir.exists(): + result_files = list(results_dir.glob("results*.json")) + if result_files: + with open(result_files[0], "r") as f: + results = json.load(f) + events = results.get("events", []) + + # Should have 1 event with count > 1 + oom_events = [ + e for e in events if "memory error" in e.get("description", "").lower() + ] + if oom_events: + # Verify event was collapsed (count > 1) + assert oom_events[0]["data"]["count"] > 1 + # Verify timestamps list exists + assert "timestamps" in oom_events[0]["data"] + # Timestamps should be pruned (first 2 + last 2) + timestamps = oom_events[0]["data"]["timestamps"] + assert len(timestamps) <= 4 + + +def test_dmesg_plugin_with_custom_regex_and_collapsing(run_cli_command, tmp_path): + """Test DmesgPlugin with both custom regex and event collapsing""" + # Create test dmesg with repeated custom errors + test_dmesg_content = """kern :err : 2026-01-07T10:00:00,000000-06:00 MY_DRIVER_ERROR: Failed to initialize device +kern :err : 2026-01-07T10:00:10,000000-06:00 MY_DRIVER_ERROR: Failed to initialize device +kern :err : 2026-01-07T10:00:20,000000-06:00 MY_DRIVER_ERROR: Failed to initialize device +kern :err : 2026-01-07T10:02:00,000000-06:00 MY_DRIVER_ERROR: Failed to initialize device +kern :err : 2026-01-07T10:04:00,000000-06:00 MY_DRIVER_ERROR: Failed to initialize device +kern :err : 2026-01-07T10:00:05,000000-06:00 oom_kill_process +kern :err : 2026-01-07T10:02:05,000000-06:00 oom_kill_process +""" + + dmesg_file = tmp_path / "custom_collapse_dmesg.log" + dmesg_file.write_text(test_dmesg_content) + + # Create config with both features + config = { + "name": "DmesgCustomRegexAndCollapsingConfig", + "desc": "DmesgPlugin config with custom regex and event collapsing", + "global_args": {}, + "plugins": { + "DmesgPlugin": { + "collection_args": {"dmesg_file": str(dmesg_file)}, + "analysis_args": { + "check_unknown_dmesg_errors": False, + "interval_to_collapse_event": 60, + "num_timestamps": 2, + "error_regex": [ + { + 
"regex": r"MY_DRIVER_ERROR.*", + "message": "My Custom Driver Error", + "event_category": "SW_DRIVER", + } + ], + }, + } + }, + "result_collators": {}, + } + + config_file = tmp_path / "custom_collapse_config.json" + config_file.write_text(json.dumps(config, indent=2)) + + log_path = str(tmp_path / "logs_custom_collapse") + result = run_cli_command( + ["--log-path", log_path, "--plugin-configs", str(config_file)], check=False + ) + + assert result.returncode in [0, 1, 2] + + # Verify results + results_dir = Path(log_path) / "dmesg_plugin" / "dmesg_analyzer" + if results_dir.exists(): + result_files = list(results_dir.glob("results*.json")) + if result_files: + with open(result_files[0], "r") as f: + results = json.load(f) + events = results.get("events", []) + + # Look for custom driver error + custom_events = [ + e for e in events if "Custom Driver Error" in e.get("description", "") + ] + if custom_events: + # Should be collapsed + assert custom_events[0]["data"]["count"] >= 1 + assert "timestamps" in custom_events[0]["data"] + + +def test_dmesg_plugin_different_collapse_intervals(run_cli_command, tmp_path): + """Test DmesgPlugin with different collapse interval values""" + # Create test dmesg with errors 10 seconds apart + test_dmesg_content = """kern :err : 2026-01-07T10:00:00,000000-06:00 IO_PAGE_FAULT +kern :err : 2026-01-07T10:00:10,000000-06:00 IO_PAGE_FAULT +kern :err : 2026-01-07T10:00:20,000000-06:00 IO_PAGE_FAULT +kern :err : 2026-01-07T10:00:30,000000-06:00 IO_PAGE_FAULT +""" + + dmesg_file = tmp_path / "interval_dmesg.log" + dmesg_file.write_text(test_dmesg_content) + + # Test with small interval (5 seconds) - should NOT collapse + config_small = { + "name": "DmesgSmallIntervalConfig", + "desc": "DmesgPlugin with 5-second collapse interval", + "global_args": {}, + "plugins": { + "DmesgPlugin": { + "collection_args": {"dmesg_file": str(dmesg_file)}, + "analysis_args": { + "check_unknown_dmesg_errors": False, + "interval_to_collapse_event": 5, + }, + } + 
}, + "result_collators": {}, + } + + config_file_small = tmp_path / "small_interval_config.json" + config_file_small.write_text(json.dumps(config_small, indent=2)) + + log_path_small = str(tmp_path / "logs_small_interval") + result = run_cli_command( + ["--log-path", log_path_small, "--plugin-configs", str(config_file_small)], check=False + ) + + assert result.returncode in [0, 1, 2] + + # Verify results have all timestamps (not collapsed much) + results_dir = Path(log_path_small) / "dmesg_plugin" / "dmesg_analyzer" + if results_dir.exists(): + result_files = list(results_dir.glob("results*.json")) + if result_files: + with open(result_files[0], "r") as f: + results = json.load(f) + events = results.get("events", []) + io_events = [e for e in events if "Page Fault" in e.get("description", "")] + if io_events: + # Should have multiple timestamps since interval is small + timestamps = io_events[0]["data"].get("timestamps", []) + assert len(timestamps) >= 3 diff --git a/test/unit/plugin/test_dmesg_analyzer.py b/test/unit/plugin/test_dmesg_analyzer.py index e542772e..4500baf8 100644 --- a/test/unit/plugin/test_dmesg_analyzer.py +++ b/test/unit/plugin/test_dmesg_analyzer.py @@ -468,3 +468,274 @@ def test_combined_ras_errors(system_info): for event in res.events: assert event.category == "RAS" assert event.priority == EventPriority.ERROR + + +def test_custom_regex_dict_passed_to_analyzer(system_info): + """Test passing custom regex as dict through error_regex parameter""" + dmesg_data = DmesgData( + dmesg_content=( + "kern :err : 2026-01-07T10:00:00,000000-06:00 custom_error_pattern_123\n" + "kern :err : 2026-01-07T10:00:01,000000-06:00 another_custom_error_xyz\n" + "kern :err : 2026-01-07T10:00:02,000000-06:00 oom_kill_process\n" + ) + ) + + custom_regex = [ + { + "regex": r"custom_error_pattern_\d+", + "message": "Custom Error Pattern", + "event_category": "SW_DRIVER", + "event_priority": 3, + }, + { + "regex": r"another_custom_error_\w+", + "message": "Another Custom 
Error", + "event_category": "RAS", + }, + ] + + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + dmesg_data, + args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False), + error_regex=custom_regex, + ) + + assert res.status == ExecutionStatus.ERROR + assert len(res.events) == 3 + + descriptions = [e.description for e in res.events] + assert "Custom Error Pattern" in descriptions + assert "Another Custom Error" in descriptions + assert "Out of memory error" in descriptions + + custom_event1 = next(e for e in res.events if e.description == "Custom Error Pattern") + assert custom_event1.category == "SW_DRIVER" + assert custom_event1.priority == EventPriority.CRITICAL + + custom_event2 = next(e for e in res.events if e.description == "Another Custom Error") + assert custom_event2.category == "RAS" + assert custom_event2.priority == EventPriority.ERROR + + +def test_event_collapsing_within_interval(system_info): + """Test that events within interval_to_collapse_event are collapsed""" + dmesg_data = DmesgData( + dmesg_content=( + "kern :err : 2026-01-07T10:00:00,000000-06:00 oom_kill_process\n" + "kern :err : 2026-01-07T10:00:30,000000-06:00 oom_kill_process\n" + "kern :err : 2026-01-07T10:00:50,000000-06:00 oom_kill_process\n" + "kern :err : 2026-01-07T10:02:00,000000-06:00 oom_kill_process\n" + "kern :err : 2026-01-07T10:04:00,000000-06:00 oom_kill_process\n" + ) + ) + + analyzer = DmesgAnalyzer(system_info=system_info) + + res = analyzer.analyze_data( + dmesg_data, + args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False, interval_to_collapse_event=60), + ) + + assert res.status == ExecutionStatus.ERROR + assert len(res.events) == 1 + event = res.events[0] + assert event.description == "Out of memory error" + assert event.data["count"] == 5 + + timestamps = event.data.get("timestamps", []) + assert len(timestamps) == 3 # First, one at 2min, one at 4min + + +def test_event_collapsing_with_different_intervals(system_info): + """Test 
event collapsing with different interval values""" + dmesg_data = DmesgData( + dmesg_content=( + "kern :err : 2026-01-07T10:00:00,000000-06:00 IO_PAGE_FAULT\n" + "kern :err : 2026-01-07T10:00:10,000000-06:00 IO_PAGE_FAULT\n" + "kern :err : 2026-01-07T10:00:20,000000-06:00 IO_PAGE_FAULT\n" + "kern :err : 2026-01-07T10:00:30,000000-06:00 IO_PAGE_FAULT\n" + ) + ) + + analyzer = DmesgAnalyzer(system_info=system_info) + + res = analyzer.analyze_data( + dmesg_data, + args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False, interval_to_collapse_event=5), + ) + + assert len(res.events) == 1 + event = res.events[0] + assert event.data["count"] == 4 + timestamps = event.data.get("timestamps", []) + assert len(timestamps) == 4 + + # Test with 100-second interval - should collapse all + res = analyzer.analyze_data( + dmesg_data, + args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False, interval_to_collapse_event=100), + ) + + assert len(res.events) == 1 + event = res.events[0] + assert event.data["count"] == 4 + timestamps = event.data.get("timestamps", []) + assert len(timestamps) == 1 + + +def test_num_timestamps_pruning(system_info): + """Test that timestamp lists are pruned to num_timestamps""" + # Create dmesg with many occurrences outside collapse interval + dmesg_lines = [] + for i in range(10): + timestamp = f"2026-01-07T10:{i*2:02d}:00,000000-06:00" + dmesg_lines.append(f"kern :err : {timestamp} oom_kill_process") + + dmesg_data = DmesgData(dmesg_content="\n".join(dmesg_lines)) + + analyzer = DmesgAnalyzer(system_info=system_info) + + res = analyzer.analyze_data( + dmesg_data, + args=DmesgAnalyzerArgs( + check_unknown_dmesg_errors=False, num_timestamps=3, interval_to_collapse_event=60 + ), + ) + + assert len(res.events) == 1 + event = res.events[0] + assert event.data["count"] == 10 + + # Should keep first 3 and last 3 timestamps + timestamps = event.data.get("timestamps", []) + assert len(timestamps) == 6 + assert "10:00:00" in timestamps[0] + assert "10:02:00" in 
timestamps[1] + assert "10:04:00" in timestamps[2] + assert "10:12:00" in timestamps[3] + assert "10:14:00" in timestamps[4] + assert "10:16:00" in timestamps[5] + + +def test_custom_regex_with_event_collapsing(system_info): + """Test that custom regex works correctly with event collapsing""" + dmesg_data = DmesgData( + dmesg_content=( + "kern :err : 2026-01-07T10:00:00,000000-06:00 my_custom_driver_error\n" + "kern :err : 2026-01-07T10:00:10,000000-06:00 my_custom_driver_error\n" + "kern :err : 2026-01-07T10:00:20,000000-06:00 my_custom_driver_error\n" + "kern :err : 2026-01-07T10:02:00,000000-06:00 my_custom_driver_error\n" + ) + ) + + custom_regex = [ + { + "regex": r"my_custom_driver_error", + "message": "Custom Driver Error", + "event_category": "SW_DRIVER", + } + ] + + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + dmesg_data, + args=DmesgAnalyzerArgs( + check_unknown_dmesg_errors=False, interval_to_collapse_event=60, num_timestamps=2 + ), + error_regex=custom_regex, + ) + + assert len(res.events) == 1 + event = res.events[0] + assert event.description == "Custom Driver Error" + assert event.data["count"] == 4 + assert event.category == "SW_DRIVER" + + timestamps = event.data.get("timestamps", []) + assert len(timestamps) == 2 + assert "10:00:00" in timestamps[0] + assert "10:02:00" in timestamps[1] + + +def test_multiple_error_types_with_collapsing(system_info): + """Test that different error types are collapsed independently""" + dmesg_data = DmesgData( + dmesg_content=( + "kern :err : 2026-01-07T10:00:00,000000-06:00 oom_kill_process\n" + "kern :err : 2026-01-07T10:00:10,000000-06:00 IO_PAGE_FAULT\n" + "kern :err : 2026-01-07T10:00:20,000000-06:00 oom_kill_process\n" + "kern :err : 2026-01-07T10:00:30,000000-06:00 IO_PAGE_FAULT\n" + "kern :err : 2026-01-07T10:01:30,000000-06:00 oom_kill_process\n" + "kern :err : 2026-01-07T10:01:40,000000-06:00 IO_PAGE_FAULT\n" + ) + ) + + analyzer = 
DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + dmesg_data, + args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False, interval_to_collapse_event=60), + ) + + assert len(res.events) == 2 + + oom_events = [e for e in res.events if e.description == "Out of memory error"] + io_fault_events = [e for e in res.events if e.description == "I/O Page Fault"] + + assert len(oom_events) == 1 + assert len(io_fault_events) == 1 + + assert oom_events[0].data["count"] == 3 + assert io_fault_events[0].data["count"] == 3 + + oom_timestamps = oom_events[0].data.get("timestamps", []) + io_timestamps = io_fault_events[0].data.get("timestamps", []) + + assert len(oom_timestamps) == 2 + assert len(io_timestamps) == 2 + + +def test_custom_regex_empty_list(system_info): + """Test that empty custom regex list doesn't break analysis""" + dmesg_data = DmesgData( + dmesg_content="kern :err : 2026-01-07T10:00:00,000000-06:00 oom_kill_process\n" + ) + + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + dmesg_data, args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False), error_regex=[] + ) + + assert len(res.events) == 1 + assert res.events[0].description == "Out of memory error" + + +def test_custom_regex_with_multiline_pattern(system_info): + """Test custom regex with multiline patterns""" + dmesg_data = DmesgData( + dmesg_content=( + "kern :err : 2026-01-07T10:00:00,000000-06:00 START_ERROR_BLOCK\n" + "kern :err : 2026-01-07T10:00:01,000000-06:00 error_detail_line1\n" + "kern :err : 2026-01-07T10:00:02,000000-06:00 error_detail_line2\n" + "kern :err : 2026-01-07T10:00:03,000000-06:00 END_ERROR_BLOCK\n" + ) + ) + + custom_regex = [ + { + "regex": r"(START_ERROR_BLOCK.*?)(?:END_ERROR_BLOCK)", + "message": "Multiline Error Block", + "event_category": "SW_DRIVER", + } + ] + + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + dmesg_data, + args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False), + 
error_regex=custom_regex, + ) + + assert len(res.events) >= 1 + multiline_events = [e for e in res.events if e.description == "Multiline Error Block"] + assert len(multiline_events) >= 1 From 6999628dab82811c711d02fd5989abdd9e8183ac Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 29 Jan 2026 12:10:06 -0600 Subject: [PATCH 08/11] README.md updates for DmesgPlugin --- README.md | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c020fdd1..730d4547 100644 --- a/README.md +++ b/README.md @@ -232,7 +232,9 @@ This would produce the following config: "analysis_range_start": null, "analysis_range_end": null, "check_unknown_dmesg_errors": true, - "exclude_category": null + "exclude_category": null, + "interval_to_collapse_event": 60, + "num_timestamps": 3 } } }, @@ -240,6 +242,59 @@ This would produce the following config: } ``` +**Running DmesgPlugin with a dmesg log file:** + +Instead of collecting dmesg from the system, you can analyze a pre-existing dmesg log file using the `--data` argument: + +```sh +node-scraper --plugin-configs DmesgPlugin --data /path/to/dmesg.log run-plugins +``` + +This will skip the collection phase and directly analyze the provided dmesg.log file. + +**Custom Error Regex Example:** + +You can extend the built-in error detection with custom regex patterns. 
Create a config file with custom error patterns: + +```json +{ + "global_args": {}, + "plugins": { + "DmesgPlugin": { + "collection_args": { + "dmesg_file": "/path/to/dmesg.log" + }, + "analysis_args": { + "check_unknown_dmesg_errors": false, + "interval_to_collapse_event": 60, + "num_timestamps": 3, + "error_regex": [ + { + "regex": "MY_CUSTOM_ERROR.*", + "message": "My Custom Error Detected", + "event_category": "SW_DRIVER", + "event_priority": 3 + }, + { + "regex": "APPLICATION_CRASH: .*", + "message": "Application Crash", + "event_category": "SW_DRIVER", + "event_priority": 4 + } + ] + } + } + }, + "result_collators": {} +} +``` + +Save this to `dmesg_custom_config.json` and run: + +```sh +node-scraper --plugin-configs dmesg_custom_config.json run-plugins DmesgPlugin +``` + #### **'summary' sub command** The 'summary' subcommand can be used to combine results from multiple runs of node-scraper to a single summary.csv file. Sample run: From 8934328445aae00a87507e5c3a4b4855f664559f Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 29 Jan 2026 12:12:47 -0600 Subject: [PATCH 09/11] fix for cmd --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 730d4547..2b28b67f 100644 --- a/README.md +++ b/README.md @@ -247,7 +247,7 @@ This would produce the following config: Instead of collecting dmesg from the system, you can analyze a pre-existing dmesg log file using the `--data` argument: ```sh -node-scraper --plugin-configs DmesgPlugin --data /path/to/dmesg.log run-plugins +node-scraper --run-plugins DmesgPlugin --data /path/to/dmesg.log --collection False ``` This will skip the collection phase and directly analyze the provided dmesg.log file. 
From 4bfdec67fe48ad0164fdc13369693a332e0b929b Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 29 Jan 2026 13:51:35 -0600 Subject: [PATCH 10/11] utest fix --- .../plugins/inband/dmesg/analyzer_args.py | 4 +++- .../plugins/inband/dmesg/dmesg_analyzer.py | 6 ++---- test/unit/plugin/test_dmesg_analyzer.py | 19 ++++++++----------- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/nodescraper/plugins/inband/dmesg/analyzer_args.py b/nodescraper/plugins/inband/dmesg/analyzer_args.py index d1e02156..f8783761 100644 --- a/nodescraper/plugins/inband/dmesg/analyzer_args.py +++ b/nodescraper/plugins/inband/dmesg/analyzer_args.py @@ -23,8 +23,9 @@ # SOFTWARE. # ############################################################################### -from typing import Optional +from typing import Optional, Union +from nodescraper.base.regexanalyzer import ErrorRegex from nodescraper.models import TimeRangeAnalysisArgs @@ -33,3 +34,4 @@ class DmesgAnalyzerArgs(TimeRangeAnalysisArgs): exclude_category: Optional[set[str]] = None interval_to_collapse_event: int = 60 num_timestamps: int = 3 + error_regex: Optional[Union[list[ErrorRegex], list[dict]]] = None diff --git a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py index cd7e0674..ccfe9ce0 100644 --- a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +++ b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py @@ -25,7 +25,7 @@ ############################################################################### import datetime import re -from typing import Optional, Union +from typing import Optional from nodescraper.base.regexanalyzer import ErrorRegex, RegexAnalyzer from nodescraper.connection.inband import TextFileArtifact @@ -539,14 +539,12 @@ def analyze_data( self, data: DmesgData, args: Optional[DmesgAnalyzerArgs] = None, - error_regex: Optional[Union[list[ErrorRegex], list[dict]]] = None, ) -> TaskResult: """Analyze dmesg data for errors Args: data (DmesgData): 
dmesg data to analyze args (Optional[DmesgAnalyzerArgs], optional): dmesg analysis arguments. Defaults to None. - error_regex (Optional[Union[list[ErrorRegex], list[dict]]]): custom error regexes to extend ERROR_REGEX Returns: TaskResult: The result of the analysis containing status and message. @@ -555,7 +553,7 @@ def analyze_data( if not args: args = DmesgAnalyzerArgs() - final_error_regex = self._convert_and_extend_error_regex(error_regex, self.ERROR_REGEX) + final_error_regex = self._convert_and_extend_error_regex(args.error_regex, self.ERROR_REGEX) if args.analysis_range_start or args.analysis_range_end: self.logger.info( diff --git a/test/unit/plugin/test_dmesg_analyzer.py b/test/unit/plugin/test_dmesg_analyzer.py index 4500baf8..5877f77e 100644 --- a/test/unit/plugin/test_dmesg_analyzer.py +++ b/test/unit/plugin/test_dmesg_analyzer.py @@ -497,8 +497,7 @@ def test_custom_regex_dict_passed_to_analyzer(system_info): analyzer = DmesgAnalyzer(system_info=system_info) res = analyzer.analyze_data( dmesg_data, - args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False), - error_regex=custom_regex, + args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False, error_regex=custom_regex), ) assert res.status == ExecutionStatus.ERROR @@ -544,7 +543,7 @@ def test_event_collapsing_within_interval(system_info): assert event.data["count"] == 5 timestamps = event.data.get("timestamps", []) - assert len(timestamps) == 3 # First, one at 2min, one at 4min + assert len(timestamps) == 3 def test_event_collapsing_with_different_intervals(system_info): @@ -571,7 +570,6 @@ def test_event_collapsing_with_different_intervals(system_info): timestamps = event.data.get("timestamps", []) assert len(timestamps) == 4 - # Test with 100-second interval - should collapse all res = analyzer.analyze_data( dmesg_data, args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False, interval_to_collapse_event=100), @@ -586,7 +584,6 @@ def test_event_collapsing_with_different_intervals(system_info): def 
test_num_timestamps_pruning(system_info): """Test that timestamp lists are pruned to num_timestamps""" - # Create dmesg with many occurrences outside collapse interval dmesg_lines = [] for i in range(10): timestamp = f"2026-01-07T10:{i*2:02d}:00,000000-06:00" @@ -607,7 +604,6 @@ def test_num_timestamps_pruning(system_info): event = res.events[0] assert event.data["count"] == 10 - # Should keep first 3 and last 3 timestamps timestamps = event.data.get("timestamps", []) assert len(timestamps) == 6 assert "10:00:00" in timestamps[0] @@ -641,9 +637,11 @@ def test_custom_regex_with_event_collapsing(system_info): res = analyzer.analyze_data( dmesg_data, args=DmesgAnalyzerArgs( - check_unknown_dmesg_errors=False, interval_to_collapse_event=60, num_timestamps=2 + check_unknown_dmesg_errors=False, + interval_to_collapse_event=60, + num_timestamps=2, + error_regex=custom_regex, ), - error_regex=custom_regex, ) assert len(res.events) == 1 @@ -703,7 +701,7 @@ def test_custom_regex_empty_list(system_info): analyzer = DmesgAnalyzer(system_info=system_info) res = analyzer.analyze_data( - dmesg_data, args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False), error_regex=[] + dmesg_data, args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False, error_regex=[]) ) assert len(res.events) == 1 @@ -732,8 +730,7 @@ def test_custom_regex_with_multiline_pattern(system_info): analyzer = DmesgAnalyzer(system_info=system_info) res = analyzer.analyze_data( dmesg_data, - args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False), - error_regex=custom_regex, + args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False, error_regex=custom_regex), ) assert len(res.events) >= 1 From cdd6e30c0eed28eea96d4e050af5d8a1c4b0e9a1 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 29 Jan 2026 18:39:44 -0600 Subject: [PATCH 11/11] utest fix --- test/unit/plugin/test_dmesg_analyzer.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/unit/plugin/test_dmesg_analyzer.py 
b/test/unit/plugin/test_dmesg_analyzer.py index 5877f77e..c14b090c 100644 --- a/test/unit/plugin/test_dmesg_analyzer.py +++ b/test/unit/plugin/test_dmesg_analyzer.py @@ -485,7 +485,7 @@ def test_custom_regex_dict_passed_to_analyzer(system_info): "regex": r"custom_error_pattern_\d+", "message": "Custom Error Pattern", "event_category": "SW_DRIVER", - "event_priority": 3, + "event_priority": 4, }, { "regex": r"another_custom_error_\w+", @@ -609,9 +609,9 @@ def test_num_timestamps_pruning(system_info): assert "10:00:00" in timestamps[0] assert "10:02:00" in timestamps[1] assert "10:04:00" in timestamps[2] - assert "10:12:00" in timestamps[3] - assert "10:14:00" in timestamps[4] - assert "10:16:00" in timestamps[5] + assert "10:14:00" in timestamps[3] + assert "10:16:00" in timestamps[4] + assert "10:18:00" in timestamps[5] def test_custom_regex_with_event_collapsing(system_info): @@ -709,7 +709,7 @@ def test_custom_regex_empty_list(system_info): def test_custom_regex_with_multiline_pattern(system_info): - """Test custom regex with multiline patterns""" + """Test custom regex that should NOT match across multiple dmesg lines (each line processed separately)""" dmesg_data = DmesgData( dmesg_content=( "kern :err : 2026-01-07T10:00:00,000000-06:00 START_ERROR_BLOCK\n" @@ -721,8 +721,8 @@ def test_custom_regex_with_multiline_pattern(system_info): custom_regex = [ { - "regex": r"(START_ERROR_BLOCK.*?)(?:END_ERROR_BLOCK)", - "message": "Multiline Error Block", + "regex": r"START_ERROR_BLOCK", + "message": "Start Error Block", "event_category": "SW_DRIVER", } ] @@ -734,5 +734,5 @@ def test_custom_regex_with_multiline_pattern(system_info): ) assert len(res.events) >= 1 - multiline_events = [e for e in res.events if e.description == "Multiline Error Block"] - assert len(multiline_events) >= 1 + start_events = [e for e in res.events if e.description == "Start Error Block"] + assert len(start_events) == 1