From 8f8b59307e72c9f949e4bb3fb3e23006ee083277 Mon Sep 17 00:00:00 2001 From: Piotr Rygielski <114479+vikin91@users.noreply.github.com> Date: Fri, 30 Jan 2026 12:06:02 +0100 Subject: [PATCH 1/7] Add rule review status --- .../cluster_entities_relationship.toml | 3 ++ automated-rules/cpu_throttling.toml | 3 ++ automated-rules/dedupe_cache_hit_rate.toml | 3 ++ automated-rules/file_descriptors.toml | 3 ++ automated-rules/go_goroutines.toml | 3 ++ automated-rules/go_memstats_heap_objects.toml | 3 ++ automated-rules/go_threads.toml | 3 ++ automated-rules/heap_utilization.toml | 3 ++ .../http_incoming_in_flight_requests.toml | 3 ++ ...ng_request_duration_histogram_seconds.toml | 3 ++ .../load-level/cluster_volume.toml | 3 ++ ...nent_process_message_duration_seconds.toml | 3 ++ ...sor_deployment_enhancement_queue_size.toml | 3 ++ ...tor_deployment_queue_operations_total.toml | 3 ++ ...r_network_flow_queue_operations_total.toml | 3 ++ ...cess_indicator_queue_operations_total.toml | 3 ++ ..._internal_exponential_backoff_seconds.toml | 22 +++++++++++---- ..._k8s_event_ingestion_to_send_duration.toml | 3 ++ ..._sensor_k8s_event_processing_duration.toml | 3 ++ .../rox_sensor_network_flow_buffer_size.toml | 3 ++ ..._flow_manager_purger_duration_seconds.toml | 3 ++ .../rox_sensor_num_pods_in_store.toml | 3 ++ .../rox_sensor_output_channel_size.toml | 3 ++ ...rox_sensor_process_signal_buffer_size.toml | 3 ++ .../rox_sensor_resolver_channel_size.toml | 3 ++ ...x_sensor_resolver_deduping_queue_size.toml | 3 ++ ...ensor_scan_call_duration_milliseconds.toml | 3 ++ automated-rules/rox_sensor_secured_nodes.toml | 3 ++ internal/evaluator/evaluator_common.go | 28 +++++++++++++++++++ internal/evaluator/histogram.go | 1 + internal/reporter/console.go | 12 +++++++- internal/rules/types.go | 6 ++++ internal/tui/view.go | 11 ++++++++ templates/markdown.tmpl | 20 +++++++++---- 34 files changed, 169 insertions(+), 12 deletions(-) diff --git a/automated-rules/cluster_entities_relationship.toml 
b/automated-rules/cluster_entities_relationship.toml index 3bfc4c2..9a3bb46 100644 --- a/automated-rules/cluster_entities_relationship.toml +++ b/automated-rules/cluster_entities_relationship.toml @@ -1,6 +1,9 @@ rule_type = "composite" display_name = "cluster_entities_relationship" description = "Validate cluster entities relationships" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [composite_config] diff --git a/automated-rules/cpu_throttling.toml b/automated-rules/cpu_throttling.toml index 0d2179b..74cd907 100644 --- a/automated-rules/cpu_throttling.toml +++ b/automated-rules/cpu_throttling.toml @@ -1,6 +1,9 @@ rule_type = "percentage" display_name = "cpu_throttling" description = "CPU throttling percentage" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [percentage_config] numerator = "rox_sensor_process_cpu_nr_throttled" diff --git a/automated-rules/dedupe_cache_hit_rate.toml b/automated-rules/dedupe_cache_hit_rate.toml index 6bcb7a3..0e006ff 100644 --- a/automated-rules/dedupe_cache_hit_rate.toml +++ b/automated-rules/dedupe_cache_hit_rate.toml @@ -1,6 +1,9 @@ rule_type = "cache_hit_rate" display_name = "dedupe_cache_hit_rate" description = "Deduplication cache effectiveness" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [cache_config] hits_metric = "rox_sensor_dedupe_cache_hits" diff --git a/automated-rules/file_descriptors.toml b/automated-rules/file_descriptors.toml index 9872d86..17c28ba 100644 --- a/automated-rules/file_descriptors.toml +++ b/automated-rules/file_descriptors.toml @@ -1,6 +1,9 @@ rule_type = "percentage" display_name = "file_descriptors" description = "File descriptor utilization" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [percentage_config] numerator = "process_open_fds" diff --git a/automated-rules/go_goroutines.toml b/automated-rules/go_goroutines.toml index 62c5b98..48ab782 100644 --- 
a/automated-rules/go_goroutines.toml +++ b/automated-rules/go_goroutines.toml @@ -2,6 +2,9 @@ rule_type = "gauge_threshold" metric_name = "go_goroutines" display_name = "go_goroutines" description = "Number of goroutines" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [thresholds] low = 10000 diff --git a/automated-rules/go_memstats_heap_objects.toml b/automated-rules/go_memstats_heap_objects.toml index 3d1fa2b..e8f7d90 100644 --- a/automated-rules/go_memstats_heap_objects.toml +++ b/automated-rules/go_memstats_heap_objects.toml @@ -2,6 +2,9 @@ rule_type = "gauge_threshold" metric_name = "go_memstats_heap_objects" display_name = "go_memstats_heap_objects" description = "Number of heap objects" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [thresholds] low = 1000000 diff --git a/automated-rules/go_threads.toml b/automated-rules/go_threads.toml index 32d831a..910346d 100644 --- a/automated-rules/go_threads.toml +++ b/automated-rules/go_threads.toml @@ -2,6 +2,9 @@ rule_type = "gauge_threshold" metric_name = "go_threads" display_name = "go_threads" description = "Number of OS threads" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [thresholds] low = 100 diff --git a/automated-rules/heap_utilization.toml b/automated-rules/heap_utilization.toml index de343ad..c2b31ff 100644 --- a/automated-rules/heap_utilization.toml +++ b/automated-rules/heap_utilization.toml @@ -1,6 +1,9 @@ rule_type = "percentage" display_name = "heap_utilization" description = "Heap memory utilization" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [percentage_config] numerator = "go_memstats_heap_alloc_bytes" diff --git a/automated-rules/http_incoming_in_flight_requests.toml b/automated-rules/http_incoming_in_flight_requests.toml index 996f89b..f483ea5 100644 --- a/automated-rules/http_incoming_in_flight_requests.toml +++ 
b/automated-rules/http_incoming_in_flight_requests.toml @@ -2,6 +2,9 @@ rule_type = "gauge_threshold" metric_name = "http_incoming_in_flight_requests" display_name = "http_incoming_in_flight_requests" description = "HTTP in-flight requests" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [thresholds] low = 10 diff --git a/automated-rules/http_incoming_request_duration_histogram_seconds.toml b/automated-rules/http_incoming_request_duration_histogram_seconds.toml index 3539e70..7743c7c 100644 --- a/automated-rules/http_incoming_request_duration_histogram_seconds.toml +++ b/automated-rules/http_incoming_request_duration_histogram_seconds.toml @@ -2,6 +2,9 @@ rule_type = "histogram" metric_name = "http_incoming_request_duration_histogram_seconds" display_name = "http_incoming_request_duration_histogram_seconds" description = "HTTP request latency distribution" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [histogram_config] unit = "seconds" diff --git a/automated-rules/load-level/cluster_volume.toml b/automated-rules/load-level/cluster_volume.toml index bd27857..9d62cbc 100644 --- a/automated-rules/load-level/cluster_volume.toml +++ b/automated-rules/load-level/cluster_volume.toml @@ -23,6 +23,9 @@ rule_type = "load_detection" display_name = "cluster_volume" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [[metrics]] name = "containers" diff --git a/automated-rules/rox_sensor_component_process_message_duration_seconds.toml b/automated-rules/rox_sensor_component_process_message_duration_seconds.toml index dac8b78..46d1970 100644 --- a/automated-rules/rox_sensor_component_process_message_duration_seconds.toml +++ b/automated-rules/rox_sensor_component_process_message_duration_seconds.toml @@ -2,6 +2,9 @@ rule_type = "histogram" metric_name = "rox_sensor_component_process_message_duration_seconds" display_name = "rox_sensor_component_process_message_duration_seconds" description = 
"Component process message duration" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [histogram_config] unit = "seconds" diff --git a/automated-rules/rox_sensor_deployment_enhancement_queue_size.toml b/automated-rules/rox_sensor_deployment_enhancement_queue_size.toml index 1be12d3..cf9418d 100644 --- a/automated-rules/rox_sensor_deployment_enhancement_queue_size.toml +++ b/automated-rules/rox_sensor_deployment_enhancement_queue_size.toml @@ -2,6 +2,9 @@ rule_type = "gauge_threshold" metric_name = "rox_sensor_deployment_enhancement_queue_size" display_name = "rox_sensor_deployment_enhancement_queue_size" description = "Deployment enhancement queue size" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [thresholds] low = 0 diff --git a/automated-rules/rox_sensor_detector_deployment_queue_operations_total.toml b/automated-rules/rox_sensor_detector_deployment_queue_operations_total.toml index bfaf6d3..1d822bd 100644 --- a/automated-rules/rox_sensor_detector_deployment_queue_operations_total.toml +++ b/automated-rules/rox_sensor_detector_deployment_queue_operations_total.toml @@ -2,6 +2,9 @@ rule_type = "queue_operations" metric_name = "rox_sensor_detector_deployment_queue_operations_total" display_name = "rox_sensor_detector_deployment_queue_operations_total" description = "Deployment queue Add/Remove balance" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [queue_config] operation_label = "Operation" diff --git a/automated-rules/rox_sensor_detector_network_flow_queue_operations_total.toml b/automated-rules/rox_sensor_detector_network_flow_queue_operations_total.toml index b76de93..2bd9f60 100644 --- a/automated-rules/rox_sensor_detector_network_flow_queue_operations_total.toml +++ b/automated-rules/rox_sensor_detector_network_flow_queue_operations_total.toml @@ -2,6 +2,9 @@ rule_type = "queue_operations" metric_name = "rox_sensor_detector_network_flow_queue_operations_total" 
display_name = "rox_sensor_detector_network_flow_queue_operations_total" description = "Network flow queue Add/Remove balance" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [queue_config] operation_label = "Operation" diff --git a/automated-rules/rox_sensor_detector_process_indicator_queue_operations_total.toml b/automated-rules/rox_sensor_detector_process_indicator_queue_operations_total.toml index e991041..0bc5267 100644 --- a/automated-rules/rox_sensor_detector_process_indicator_queue_operations_total.toml +++ b/automated-rules/rox_sensor_detector_process_indicator_queue_operations_total.toml @@ -2,6 +2,9 @@ rule_type = "queue_operations" metric_name = "rox_sensor_detector_process_indicator_queue_operations_total" display_name = "rox_sensor_detector_process_indicator_queue_operations_total" description = "Process indicator queue Add/Remove balance" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [queue_config] operation_label = "Operation" diff --git a/automated-rules/rox_sensor_enricher_image_scan_internal_exponential_backoff_seconds.toml b/automated-rules/rox_sensor_enricher_image_scan_internal_exponential_backoff_seconds.toml index 8720016..09e6c76 100644 --- a/automated-rules/rox_sensor_enricher_image_scan_internal_exponential_backoff_seconds.toml +++ b/automated-rules/rox_sensor_enricher_image_scan_internal_exponential_backoff_seconds.toml @@ -1,7 +1,10 @@ rule_type = "histogram" metric_name = "rox_sensor_enricher_image_scan_internal_exponential_backoff_seconds" display_name = "rox_sensor_enricher_image_scan_internal_exponential_backoff_seconds" -description = "Image scan exponential backoff duration" +description = "Image scan exponential backoff duration. High value means the scanner is overloaded and image scans are being delayed." 
+reviewed = "Yes, human-reviewed" +last_review_by = "Piotr" +last_review_on = "2026-01-30" [histogram_config] unit = "seconds" @@ -12,12 +15,21 @@ p95_warn = 4.0 [messages] green = "p95={p95:.3f}s, p99={p99:.3f}s (good)" -yellow = "p95={p95:.3f}s, p99={p99:.3f}s (elevated)" -red = "p95={p95:.3f}s, p99={p99:.3f}s (high backoff - scanner issues)" +yellow = """ +p95={p95:.3f}s, p99={p99:.3f}s (elevated) +High value = the system is spending a lot of time waiting before it can scan images, not actually doing work. +That waiting happens only when the scanner is too busy or rate‑limited, so a high number means requests are piling up and scans are delayed. +This delay applies to each scan request being sent to scanner. +""" +red = """ +p95={p95:.3f}s, p99={p99:.3f}s (high backoff - scanner issues). +High value = the system is spending a lot of time waiting before it can scan images, not actually doing work. +That waiting happens only when the scanner is too busy or rate‑limited, so a high number means requests are piling up and scans are delayed. +This delay applies to each scan request being sent to scanner. +""" [remediation] red = "High image scan backoff duration. Check scanner connectivity and performance." yellow = "Monitor image scan backoff trends." 
-acs_versions = ["4.7+", "4.8+", "4.9+"] - +acs_versions = ["4.7+", "4.8+", "4.9+", "4.10+"] diff --git a/automated-rules/rox_sensor_k8s_event_ingestion_to_send_duration.toml b/automated-rules/rox_sensor_k8s_event_ingestion_to_send_duration.toml index b8d0b95..920d228 100644 --- a/automated-rules/rox_sensor_k8s_event_ingestion_to_send_duration.toml +++ b/automated-rules/rox_sensor_k8s_event_ingestion_to_send_duration.toml @@ -2,6 +2,9 @@ rule_type = "histogram" metric_name = "rox_sensor_k8s_event_ingestion_to_send_duration" display_name = "rox_sensor_k8s_event_ingestion_to_send_duration" description = "K8s event ingestion to send duration" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [histogram_config] unit = "seconds" diff --git a/automated-rules/rox_sensor_k8s_event_processing_duration.toml b/automated-rules/rox_sensor_k8s_event_processing_duration.toml index 5c4a3dc..09f2297 100644 --- a/automated-rules/rox_sensor_k8s_event_processing_duration.toml +++ b/automated-rules/rox_sensor_k8s_event_processing_duration.toml @@ -2,6 +2,9 @@ rule_type = "histogram" metric_name = "rox_sensor_k8s_event_processing_duration" display_name = "rox_sensor_k8s_event_processing_duration" description = "K8s event processing duration" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [histogram_config] unit = "seconds" diff --git a/automated-rules/rox_sensor_network_flow_buffer_size.toml b/automated-rules/rox_sensor_network_flow_buffer_size.toml index b551afa..e54dbc3 100644 --- a/automated-rules/rox_sensor_network_flow_buffer_size.toml +++ b/automated-rules/rox_sensor_network_flow_buffer_size.toml @@ -2,6 +2,9 @@ rule_type = "gauge_threshold" metric_name = "rox_sensor_network_flow_buffer_size" display_name = "rox_sensor_network_flow_buffer_size" description = "Network flow buffer size monitoring" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [thresholds] low = 100 diff --git 
a/automated-rules/rox_sensor_network_flow_manager_purger_duration_seconds.toml b/automated-rules/rox_sensor_network_flow_manager_purger_duration_seconds.toml index dda6a35..e632fd5 100644 --- a/automated-rules/rox_sensor_network_flow_manager_purger_duration_seconds.toml +++ b/automated-rules/rox_sensor_network_flow_manager_purger_duration_seconds.toml @@ -2,6 +2,9 @@ rule_type = "histogram" metric_name = "rox_sensor_network_flow_manager_purger_duration_seconds" display_name = "rox_sensor_network_flow_manager_purger_duration_seconds" description = "Network flow manager purger duration" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [histogram_config] unit = "seconds" diff --git a/automated-rules/rox_sensor_num_pods_in_store.toml b/automated-rules/rox_sensor_num_pods_in_store.toml index 3fde613..ff34e3f 100644 --- a/automated-rules/rox_sensor_num_pods_in_store.toml +++ b/automated-rules/rox_sensor_num_pods_in_store.toml @@ -2,6 +2,9 @@ rule_type = "gauge_threshold" metric_name = "rox_sensor_num_pods_in_store" display_name = "rox_sensor_num_pods_in_store" description = "Number of pods tracked by Sensor" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [thresholds] low = 0 diff --git a/automated-rules/rox_sensor_output_channel_size.toml b/automated-rules/rox_sensor_output_channel_size.toml index ec13bc2..48a679f 100644 --- a/automated-rules/rox_sensor_output_channel_size.toml +++ b/automated-rules/rox_sensor_output_channel_size.toml @@ -2,6 +2,9 @@ rule_type = "gauge_threshold" metric_name = "rox_sensor_output_channel_size" display_name = "rox_sensor_output_channel_size" description = "Output channel size (messages to Central)" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [thresholds] low = 50 diff --git a/automated-rules/rox_sensor_process_signal_buffer_size.toml b/automated-rules/rox_sensor_process_signal_buffer_size.toml index 9e64ece..41bebba 100644 --- 
a/automated-rules/rox_sensor_process_signal_buffer_size.toml +++ b/automated-rules/rox_sensor_process_signal_buffer_size.toml @@ -2,6 +2,9 @@ rule_type = "gauge_threshold" metric_name = "rox_sensor_process_signal_buffer_size" display_name = "rox_sensor_process_signal_buffer_size" description = "Process signal buffer size" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [thresholds] low = 100 diff --git a/automated-rules/rox_sensor_resolver_channel_size.toml b/automated-rules/rox_sensor_resolver_channel_size.toml index d05a004..7cf9f22 100644 --- a/automated-rules/rox_sensor_resolver_channel_size.toml +++ b/automated-rules/rox_sensor_resolver_channel_size.toml @@ -2,6 +2,9 @@ rule_type = "gauge_threshold" metric_name = "rox_sensor_resolver_channel_size" display_name = "rox_sensor_resolver_channel_size" description = "Resolver channel size" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [thresholds] low = 10 diff --git a/automated-rules/rox_sensor_resolver_deduping_queue_size.toml b/automated-rules/rox_sensor_resolver_deduping_queue_size.toml index 6c8111d..b4c7ab8 100644 --- a/automated-rules/rox_sensor_resolver_deduping_queue_size.toml +++ b/automated-rules/rox_sensor_resolver_deduping_queue_size.toml @@ -2,6 +2,9 @@ rule_type = "gauge_threshold" metric_name = "rox_sensor_resolver_deduping_queue_size" display_name = "rox_sensor_resolver_deduping_queue_size" description = "Resolver deduping queue size" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [thresholds] low = 10 diff --git a/automated-rules/rox_sensor_scan_call_duration_milliseconds.toml b/automated-rules/rox_sensor_scan_call_duration_milliseconds.toml index d97e9fb..ab96828 100644 --- a/automated-rules/rox_sensor_scan_call_duration_milliseconds.toml +++ b/automated-rules/rox_sensor_scan_call_duration_milliseconds.toml @@ -2,6 +2,9 @@ rule_type = "histogram" metric_name = "rox_sensor_scan_call_duration_milliseconds" 
display_name = "rox_sensor_scan_call_duration_milliseconds" description = "Scan call duration" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [histogram_config] unit = "milliseconds" diff --git a/automated-rules/rox_sensor_secured_nodes.toml b/automated-rules/rox_sensor_secured_nodes.toml index cba8ebd..4de204e 100644 --- a/automated-rules/rox_sensor_secured_nodes.toml +++ b/automated-rules/rox_sensor_secured_nodes.toml @@ -2,6 +2,9 @@ rule_type = "gauge_threshold" metric_name = "rox_sensor_secured_nodes" display_name = "rox_sensor_secured_nodes" description = "Number of secured nodes" +reviewed = "No, AI-generated" +last_review_by = "" +last_review_on = "never" [thresholds] low = 0 diff --git a/internal/evaluator/evaluator_common.go b/internal/evaluator/evaluator_common.go index ef14a36..2147c77 100644 --- a/internal/evaluator/evaluator_common.go +++ b/internal/evaluator/evaluator_common.go @@ -1,6 +1,7 @@ package evaluator import ( + "fmt" "time" "github.com/stackrox/sensor-metrics-analyzer/internal/parser" @@ -71,6 +72,31 @@ func getRemediation(rule rules.Rule, status rules.Status) string { } } +func applyReviewMetadata(rule rules.Rule) string { + review := rule.Reviewed + by := rule.LastReviewBy + on := rule.LastReviewOn + + switch { + case review != "" && by != "" && on != "": + return fmt.Sprintf("%s (last review: %s on %s)", review, by, on) + case review != "" && by != "": + return fmt.Sprintf("%s (last review: %s)", review, by) + case review != "" && on != "": + return fmt.Sprintf("%s (last review: %s)", review, on) + case review != "": + return review + case by != "" && on != "": + return fmt.Sprintf("Last review: %s on %s", by, on) + case by != "": + return fmt.Sprintf("Last review: %s", by) + case on != "": + return fmt.Sprintf("Last review: %s", on) + } + + return "review status unavailable" +} + // EvaluateAllRules evaluates all rules against metrics func EvaluateAllRules(rulesList []rules.Rule, metrics 
parser.MetricsData, loadLevel rules.LoadLevel, acsVersion string) rules.AnalysisReport { report := rules.AnalysisReport{ @@ -108,6 +134,8 @@ func EvaluateAllRules(rulesList []rules.Rule, metrics parser.MetricsData, loadLe result = EvaluateCorrelation(rule, metrics, result) } + result.ReviewStatus = applyReviewMetadata(rule) + // Add potential actions (user-facing) result.Remediation = getRemediation(rule, result.Status) result.PotentialActionUser = result.Remediation diff --git a/internal/evaluator/histogram.go b/internal/evaluator/histogram.go index fed7ace..5dbcad2 100644 --- a/internal/evaluator/histogram.go +++ b/internal/evaluator/histogram.go @@ -125,6 +125,7 @@ func evaluateSingleHistogramInfOverflow(baseName string, metrics parser.MetricsD Details: []string{}, Timestamp: time.Now(), } + result.ReviewStatus = "Automatically generated rule; review by the code author" // Get histogram buckets bucketMetricName := baseName + "_bucket" diff --git a/internal/reporter/console.go b/internal/reporter/console.go index 5f05075..088c038 100644 --- a/internal/reporter/console.go +++ b/internal/reporter/console.go @@ -65,6 +65,9 @@ func GenerateConsole(report rules.AnalysisReport) string { result.WriteString(color.New(color.Bold).Sprintf("%s\n", r.RuleName)) result.WriteString(color.RedString(" Status: RED\n")) result.WriteString(fmt.Sprintf(" Message: %s\n", r.Message)) + if r.ReviewStatus != "" { + result.WriteString(fmt.Sprintf(" Review: %s\n", r.ReviewStatus)) + } if len(r.Details) > 0 { result.WriteString(color.New(color.FgYellow).Sprint(" Details:\n")) for _, detail := range r.Details { @@ -93,6 +96,9 @@ func GenerateConsole(report rules.AnalysisReport) string { result.WriteString(color.New(color.Bold).Sprintf("%s\n", r.RuleName)) result.WriteString(color.YellowString(" Status: YELLOW\n")) result.WriteString(fmt.Sprintf(" Message: %s\n", r.Message)) + if r.ReviewStatus != "" { + result.WriteString(fmt.Sprintf(" Review: %s\n", r.ReviewStatus)) + } if len(r.Details) > 
0 { result.WriteString(color.New(color.FgYellow).Sprint(" Details:\n")) for _, detail := range r.Details { @@ -119,7 +125,11 @@ func GenerateConsole(report rules.AnalysisReport) string { result.WriteString(color.New(color.Bold, color.FgGreen).Sprint("🟢 Healthy Metrics\n\n")) for _, r := range greenResults { result.WriteString(color.GreenString(" ✓ ")) - result.WriteString(fmt.Sprintf("%s: %s\n", r.RuleName, r.Message)) + if r.ReviewStatus != "" { + result.WriteString(fmt.Sprintf("%s: %s (review: %s)\n", r.RuleName, r.Message, r.ReviewStatus)) + } else { + result.WriteString(fmt.Sprintf("%s: %s\n", r.RuleName, r.Message)) + } } } diff --git a/internal/rules/types.go b/internal/rules/types.go index 7eb8482..e6a45e7 100644 --- a/internal/rules/types.go +++ b/internal/rules/types.go @@ -89,6 +89,11 @@ type Rule struct { DisplayName string `toml:"display_name"` Description string `toml:"description"` + // Review metadata (optional) + Reviewed string `toml:"reviewed"` + LastReviewBy string `toml:"last_review_by"` + LastReviewOn string `toml:"last_review_on"` + // Type-specific configurations GaugeConfig *GaugeConfig `toml:"gauge_config"` PercentageConfig *PercentageConfig `toml:"percentage_config"` @@ -194,6 +199,7 @@ type EvaluationResult struct { Message string Value float64 Details []string + ReviewStatus string Remediation string // Legacy field (use PotentialActionUser/Developer) PotentialActionUser string PotentialActionDeveloper string diff --git a/internal/tui/view.go b/internal/tui/view.go index d95eeda..fbc1ece 100644 --- a/internal/tui/view.go +++ b/internal/tui/view.go @@ -226,6 +226,17 @@ func (m Model) viewDetail() string { } detail.WriteString("\n") + // Review status + if result.ReviewStatus != "" { + detail.WriteString(detailLabelStyle.Render("Review:")) + detail.WriteString("\n") + wrappedReview := wordWrap(result.ReviewStatus, messageWidth) + for _, line := range strings.Split(wrappedReview, "\n") { + detail.WriteString(fmt.Sprintf(" %s\n", line)) + } 
+ detail.WriteString("\n") + } + // Value if result.Value != 0 { detail.WriteString(detailLabelStyle.Render("Value: ")) diff --git a/templates/markdown.tmpl b/templates/markdown.tmpl index 7353844..ab9f7f2 100644 --- a/templates/markdown.tmpl +++ b/templates/markdown.tmpl @@ -19,12 +19,14 @@ {{ if gt $i 0 }} --- {{ end }} -### {{ $r.RuleName }} -#### Status -RED +### 🔴 {{ $r.RuleName }} #### Message {{ $r.Message }} +{{ if $r.ReviewStatus }} +#### Review status +{{ $r.ReviewStatus }} +{{ end }} {{ if gt (len $r.Details) 0 }} #### Details {{ range $r.Details }} @@ -52,12 +54,14 @@ RED {{ if gt $i 0 }} --- {{ end }} -### {{ $r.RuleName }} -#### Status -YELLOW +### 🟡 {{ $r.RuleName }} #### Message {{ $r.Message }} +{{ if $r.ReviewStatus }} +#### Review status +{{ $r.ReviewStatus }} +{{ end }} {{ if gt (len $r.Details) 0 }} #### Details {{ range $r.Details }} @@ -81,7 +85,11 @@ YELLOW ## 🟢 Healthy Metrics {{ range .GreenResults }} +{{ if .ReviewStatus }} +- **{{.RuleName}}:** {{.Message}} _(review: {{.ReviewStatus}})_ +{{ else }} - **{{.RuleName}}:** {{.Message}} {{ end }} +{{ end }} {{ end }} From 0f50333775ecbfa32f87004bd9901e2b13cd791e Mon Sep 17 00:00:00 2001 From: Piotr Rygielski <114479+vikin91@users.noreply.github.com> Date: Fri, 30 Jan 2026 13:47:57 +0100 Subject: [PATCH 2/7] Fix and improve rules --- ..._internal_exponential_backoff_seconds.toml | 4 ++-- .../rox_sensor_network_flow_buffer_size.toml | 3 +++ ...ensor_scan_call_duration_milliseconds.toml | 22 +++++++++++++------ 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/automated-rules/rox_sensor_enricher_image_scan_internal_exponential_backoff_seconds.toml b/automated-rules/rox_sensor_enricher_image_scan_internal_exponential_backoff_seconds.toml index 09e6c76..a4fcc44 100644 --- a/automated-rules/rox_sensor_enricher_image_scan_internal_exponential_backoff_seconds.toml +++ b/automated-rules/rox_sensor_enricher_image_scan_internal_exponential_backoff_seconds.toml @@ -29,7 +29,7 @@ This delay 
applies to each scan request being sent to scanner. """ [remediation] -red = "High image scan backoff duration. Check scanner connectivity and performance." -yellow = "Monitor image scan backoff trends." +red = "High image scan backoff duration. Check scanner connectivity and performance. Also check the `rox_sensor_scan_call_duration_milliseconds` metric." +yellow = "Monitor image scan backoff trends. Also check the `rox_sensor_scan_call_duration_milliseconds` metric." acs_versions = ["4.7+", "4.8+", "4.9+", "4.10+"] diff --git a/automated-rules/rox_sensor_network_flow_buffer_size.toml b/automated-rules/rox_sensor_network_flow_buffer_size.toml index e54dbc3..066ce9b 100644 --- a/automated-rules/rox_sensor_network_flow_buffer_size.toml +++ b/automated-rules/rox_sensor_network_flow_buffer_size.toml @@ -14,14 +14,17 @@ higher_is_worse = true [load_level_thresholds.low] low = 100 high = 150 +higher_is_worse = true [load_level_thresholds.medium] low = 150 high = 250 +higher_is_worse = true [load_level_thresholds.high] low = 200 high = 300 +higher_is_worse = true [messages] green = "Buffer size: {value} (default: 100, max: 300)" diff --git a/automated-rules/rox_sensor_scan_call_duration_milliseconds.toml b/automated-rules/rox_sensor_scan_call_duration_milliseconds.toml index ab96828..5381166 100644 --- a/automated-rules/rox_sensor_scan_call_duration_milliseconds.toml +++ b/automated-rules/rox_sensor_scan_call_duration_milliseconds.toml @@ -1,10 +1,10 @@ rule_type = "histogram" metric_name = "rox_sensor_scan_call_duration_milliseconds" display_name = "rox_sensor_scan_call_duration_milliseconds" -description = "Scan call duration" -reviewed = "No, AI-generated" -last_review_by = "" -last_review_on = "never" +description = "Time it takes the secured-cluster scanner to process a request" +reviewed = "Yes, human-reviewed" +last_review_by = "Piotr" +last_review_on = "2026-01-30" [histogram_config] unit = "milliseconds" @@ -15,12 +15,20 @@ p95_warn = 2000 [messages] green =
"p95={p95:.0f}ms, p99={p99:.0f}ms (good)" -yellow = "p95={p95:.0f}ms, p99={p99:.0f}ms (elevated)" -red = "p95={p95:.0f}ms, p99={p99:.0f}ms (high latency)" +yellow = """ +p95={p95:.0f}ms, p99={p99:.0f}ms (elevated) +Secured-cluster scanner (delegated scanning) is responding slowly and slowing down sensor pipelines. +If the trend continues it may lead to Sensor consuming significant memory and degraded performance. +""" +red = """ +p95={p95:.0f}ms, p99={p99:.0f}ms (high latency) +Secured-cluster scanner (delegated scanning) is responding slowly and slowing down sensor pipelines. +Check metric `rox_sensor_k8s_event_ingestion_to_send_duration`: if its status is red, then sensor is most probably already being choked and may start consuming significant amounts of memory. +""" [remediation] red = "High scan call latency. Check scanner performance and connectivity." yellow = "Monitor scan call latency trends." -acs_versions = ["4.7+", "4.8+", "4.9+"] +acs_versions = ["4.7+", "4.8+", "4.9+", "4.10+"] From 4d74f0361d15a8b0b2128a783530fc7a050158fe Mon Sep 17 00:00:00 2001 From: Piotr Rygielski <114479+vikin91@users.noreply.github.com> Date: Fri, 30 Jan 2026 14:01:32 +0100 Subject: [PATCH 3/7] Add tests for gauge threshold type rules --- internal/evaluator/gauge_threshold_test.go | 150 +++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 internal/evaluator/gauge_threshold_test.go diff --git a/internal/evaluator/gauge_threshold_test.go b/internal/evaluator/gauge_threshold_test.go new file mode 100644 index 0000000..b677225 --- /dev/null +++ b/internal/evaluator/gauge_threshold_test.go @@ -0,0 +1,150 @@ +package evaluator + +import ( + "testing" + + "github.com/stackrox/sensor-metrics-analyzer/internal/parser" + "github.com/stackrox/sensor-metrics-analyzer/internal/rules" +) + +func TestGaugeThresholdsGeneric(t *testing.T) { + tests := []struct { + name string + rule rules.Rule + value float64 + expected rules.Status + }{ + { + name: "higher is worse below
low is green", + rule: rules.Rule{ + RuleType: rules.RuleTypeGauge, + MetricName: "test_metric", + Thresholds: rules.Thresholds{ + Low: 100, + High: 200, + HigherIsWorse: true, + }, + Messages: rules.Messages{ + Green: "green", + Yellow: "yellow", + Red: "red", + }, + }, + value: 10, + expected: rules.StatusGreen, + }, + { + name: "higher is worse between low/high is yellow", + rule: rules.Rule{ + RuleType: rules.RuleTypeGauge, + MetricName: "test_metric", + Thresholds: rules.Thresholds{ + Low: 100, + High: 200, + HigherIsWorse: true, + }, + Messages: rules.Messages{ + Green: "green", + Yellow: "yellow", + Red: "red", + }, + }, + value: 150, + expected: rules.StatusYellow, + }, + { + name: "higher is worse at/above high is red", + rule: rules.Rule{ + RuleType: rules.RuleTypeGauge, + MetricName: "test_metric", + Thresholds: rules.Thresholds{ + Low: 100, + High: 200, + HigherIsWorse: true, + }, + Messages: rules.Messages{ + Green: "green", + Yellow: "yellow", + Red: "red", + }, + }, + value: 200, + expected: rules.StatusRed, + }, + { + name: "lower is worse above high is green", + rule: rules.Rule{ + RuleType: rules.RuleTypeGauge, + MetricName: "test_metric", + Thresholds: rules.Thresholds{ + Low: 100, + High: 200, + HigherIsWorse: false, + }, + Messages: rules.Messages{ + Green: "green", + Yellow: "yellow", + Red: "red", + }, + }, + value: 250, + expected: rules.StatusGreen, + }, + { + name: "lower is worse between low/high is yellow", + rule: rules.Rule{ + RuleType: rules.RuleTypeGauge, + MetricName: "test_metric", + Thresholds: rules.Thresholds{ + Low: 100, + High: 200, + HigherIsWorse: false, + }, + Messages: rules.Messages{ + Green: "green", + Yellow: "yellow", + Red: "red", + }, + }, + value: 150, + expected: rules.StatusYellow, + }, + { + name: "lower is worse below low is red", + rule: rules.Rule{ + RuleType: rules.RuleTypeGauge, + MetricName: "test_metric", + Thresholds: rules.Thresholds{ + Low: 100, + High: 200, + HigherIsWorse: false, + }, + Messages: 
rules.Messages{ + Green: "green", + Yellow: "yellow", + Red: "red", + }, + }, + value: 50, + expected: rules.StatusRed, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + metrics := parser.MetricsData{ + "test_metric": { + Name: "test_metric", + Values: []parser.MetricValue{ + {Labels: map[string]string{}, Value: tt.value}, + }, + }, + } + + result := EvaluateGauge(tt.rule, metrics, rules.LoadLevelMedium) + if result.Status != tt.expected { + t.Fatalf("expected %s, got %s (value %.2f)", tt.expected, result.Status, tt.value) + } + }) + } +} From da961762b77dda1497d3824b68b224e6ee6f7cbb Mon Sep 17 00:00:00 2001 From: Piotr Rygielski <114479+vikin91@users.noreply.github.com> Date: Fri, 30 Jan 2026 14:07:57 +0100 Subject: [PATCH 4/7] Improve rule rox_sensor_resolver_channel_size --- .../rox_sensor_resolver_channel_size.toml | 45 ++++++++++++++++--- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/automated-rules/rox_sensor_resolver_channel_size.toml b/automated-rules/rox_sensor_resolver_channel_size.toml index 7cf9f22..277abcc 100644 --- a/automated-rules/rox_sensor_resolver_channel_size.toml +++ b/automated-rules/rox_sensor_resolver_channel_size.toml @@ -1,10 +1,22 @@ rule_type = "gauge_threshold" metric_name = "rox_sensor_resolver_channel_size" display_name = "rox_sensor_resolver_channel_size" -description = "Resolver channel size" -reviewed = "No, AI-generated" -last_review_by = "" -last_review_on = "never" +description = """ +Resolver channel size. +This metric is the number of elements waiting in the Sensor’s internal queue of Kubernetes events waiting to be “resolved” +(enriched with details) before they can be sent onward. + +Here “resolving” means turning a raw Kubernetes event into a fully‑enriched deployment update that Sensor can send downstream. +That includes: +(1) Determining which deployments are affected by the event (the “deployment references”). 
+(2) Loading the deployment from Sensor’s store and (if needed) rebuilding it with dependencies (RBAC permission level, service exposure info, local registry images). +(3) Updating the endpoint manager for create/update. +(4) Deciding whether to emit an event and trigger deploy‑time detection. +""" + +reviewed = "Yes, by human" +last_review_by = "Piotr" +last_review_on = "30-01-2026" [thresholds] low = 10 @@ -13,12 +25,31 @@ higher_is_worse = true [messages] green = "{value:.0f} items queued (internal bottleneck if growing)" -yellow = "{value:.0f} items queued (elevated)" -red = "{value:.0f} items queued (backed up - internal processing issue)" +yellow = """ +{value:.0f} items queued (elevated) + +Events take longer to process, which can make inventory/alerts lag behind what’s happening in the cluster. +Common causes of high values: +(1) A surge of Kubernetes events (e.g., large deployments, many changes at once). +(2) Resolver work is slower than usual (expensive lookups/enrichment). +(3) Resource pressure on the Sensor pod (CPU throttling, memory pressure, GC pauses). +(4) Downstream bottleneck (later stages are slow, so resolver can’t drain its queue fast enough). +(5) Network or Central-side slowness that indirectly backs up the pipeline. +""" +red = """ +{value:.0f} items queued (backed up - internal processing issue). + +Common causes of high values: +(1) A surge of Kubernetes events (e.g., large deployments, many changes at once). +(2) Resolver work is slower than usual (expensive lookups/enrichment). +(3) Resource pressure on the Sensor pod (CPU throttling, memory pressure, GC pauses). +(4) Downstream bottleneck (later stages are slow, so resolver can’t drain its queue fast enough). +(5) Network or Central-side slowness that indirectly backs up the pipeline. +""" [remediation] red = "Resolver channel backed up. Check internal processing bottlenecks. Review resolver performance." yellow = "Monitor resolver channel size trends." 
-acs_versions = ["4.7+", "4.8+", "4.9+"] +acs_versions = ["4.7+", "4.8+", "4.9+", "4.10+"] From cc50c10843f167aaa887919292690ffed654ee62 Mon Sep 17 00:00:00 2001 From: Piotr Rygielski <114479+vikin91@users.noreply.github.com> Date: Fri, 30 Jan 2026 16:51:03 +0100 Subject: [PATCH 5/7] Review and polish rules --- .../cluster_entities_relationship.toml | 22 ++++++++++++++----- automated-rules/go_memstats_heap_objects.toml | 6 ++--- ...sor_deployment_enhancement_queue_size.toml | 9 +++++--- ..._sensor_k8s_event_processing_duration.toml | 20 +++++++++++------ ...ensor_scan_call_duration_milliseconds.toml | 4 ++-- 5 files changed, 40 insertions(+), 21 deletions(-) diff --git a/automated-rules/cluster_entities_relationship.toml b/automated-rules/cluster_entities_relationship.toml index 9a3bb46..3724c49 100644 --- a/automated-rules/cluster_entities_relationship.toml +++ b/automated-rules/cluster_entities_relationship.toml @@ -1,9 +1,12 @@ rule_type = "composite" display_name = "cluster_entities_relationship" -description = "Validate cluster entities relationships" -reviewed = "No, AI-generated" -last_review_by = "" -last_review_on = "never" +description = """ +Validate the relationship between the number of containers, their endpoints (ports), and IP addresses. +""" + +reviewed = "Partially, by a human" +last_review_by = "Piotr" +last_review_on = "30-01-2026" [composite_config] @@ -31,7 +34,11 @@ numerator = "endpoints" denominator = "containers" min_ratio = 0.5 status = "yellow" -message = "Entities: {containers} containers, {endpoints} endpoints, {ips} IPs - Low endpoint ratio" +message = """ +Entities: {containers} containers, {endpoints} endpoints, {ips} IPs - Low endpoint to container ratio. +This roughly means that more than half of the containers have 0 endpoints (ports). +This is unusual, but possible. 
+""" [[composite_config.checks]] check_type = "ratio" @@ -39,7 +46,10 @@ numerator = "ips" denominator = "endpoints" min_ratio = 0.5 status = "yellow" -message = "Entities: {containers} containers, {endpoints} endpoints, {ips} IPs - Low IP ratio" +message = """ +Entities: {containers} containers, {endpoints} endpoints, {ips} IPs - Low IP to endpoints ratio. +We expect that there should be more unique IP addresses assigned to this number of containers and endpoints. +""" [messages] green = "Entities: {containers} containers, {endpoints} endpoints, {ips} IPs (healthy ratios)" diff --git a/automated-rules/go_memstats_heap_objects.toml b/automated-rules/go_memstats_heap_objects.toml index e8f7d90..4c967d2 100644 --- a/automated-rules/go_memstats_heap_objects.toml +++ b/automated-rules/go_memstats_heap_objects.toml @@ -12,9 +12,9 @@ high = 10000000 higher_is_worse = true [messages] -green = "{value:.0f} heap objects (healthy)" -yellow = "{value:.0f} heap objects (elevated - monitor memory)" -red = "{value:.0f} heap objects (very high - significant memory use)" +green = "{value_human} heap objects (healthy)" +yellow = "{value_human} heap objects (elevated - monitor memory)" +red = "{value_human} heap objects (very high - significant memory use)" [remediation] red = "Very high heap object count. Check for memory leaks. Review object allocation patterns." diff --git a/automated-rules/rox_sensor_deployment_enhancement_queue_size.toml b/automated-rules/rox_sensor_deployment_enhancement_queue_size.toml index cf9418d..82dcc54 100644 --- a/automated-rules/rox_sensor_deployment_enhancement_queue_size.toml +++ b/automated-rules/rox_sensor_deployment_enhancement_queue_size.toml @@ -1,13 +1,16 @@ rule_type = "gauge_threshold" metric_name = "rox_sensor_deployment_enhancement_queue_size" display_name = "rox_sensor_deployment_enhancement_queue_size" -description = "Deployment enhancement queue size" +description = """ +Number of deployments waiting for enhancement. 
+""" + reviewed = "No, AI-generated" last_review_by = "" last_review_on = "never" [thresholds] -low = 0 +low = 1 high = 2 higher_is_worse = true @@ -20,5 +23,5 @@ red = "{value:.0f} items queued (backed up - processing issues)" red = "Deployment enhancement queue backed up. Check processing capacity and bottlenecks." yellow = "Monitor queue size. Investigate if backlog persists." -acs_versions = ["4.7+", "4.8+", "4.9+"] +acs_versions = ["4.7+", "4.8+", "4.9+", "4.10+"] diff --git a/automated-rules/rox_sensor_k8s_event_processing_duration.toml b/automated-rules/rox_sensor_k8s_event_processing_duration.toml index 09f2297..dca03a8 100644 --- a/automated-rules/rox_sensor_k8s_event_processing_duration.toml +++ b/automated-rules/rox_sensor_k8s_event_processing_duration.toml @@ -1,17 +1,23 @@ rule_type = "histogram" metric_name = "rox_sensor_k8s_event_processing_duration" display_name = "rox_sensor_k8s_event_processing_duration" -description = "K8s event processing duration" -reviewed = "No, AI-generated" -last_review_by = "" -last_review_on = "never" +description = """ +K8s event processing duration in milliseconds. +It measures how long the listener’s dispatcher takes to process a single Kubernetes event, +from right before ProcessEvent starts until right after it finishes and the metric is recorded. +It does not include resolver, detector, or sending to Central. 
+""" + +reviewed = "Partially, by a human" +last_review_by = "Piotr" +last_review_on = "30-01-2026" [histogram_config] -unit = "seconds" +unit = "ms" [thresholds] -p95_good = 0.1 -p95_warn = 1.0 +p95_good = 256 +p95_warn = 512 [messages] green = "p95={p95:.3f}s, p99={p99:.3f}s (good)" diff --git a/automated-rules/rox_sensor_scan_call_duration_milliseconds.toml b/automated-rules/rox_sensor_scan_call_duration_milliseconds.toml index 5381166..88c5e43 100644 --- a/automated-rules/rox_sensor_scan_call_duration_milliseconds.toml +++ b/automated-rules/rox_sensor_scan_call_duration_milliseconds.toml @@ -16,12 +16,12 @@ p95_warn = 2000 [messages] green = "p95={p95:.0f}ms, p99={p99:.0f}ms (good)" yellow = """ -p95={p95:.0f}ms, p99={p99:.0f}ms (elevated) +Elevated latency when calling scanner. Secured-cluster scanner (delegated scanning) is responding slowly and slowing down sensor pipelines. If the trend continues it may lead to Sensor consuming significant memory and degraded performance. """ red = """ -p95={p95:.0f}ms, p99={p99:.0f}ms (high latency) +High latency when calling scanner. Secured-cluster scanner (delegated scanning) is responding slowly and slowing down sensor pipelines. Check metric `rox_sensor_k8s_event_ingestion_to_send_duration` if it's status is red, then sensor is most probably already being chocked and may start consume significant amounts of memory. 
""" From a85ab2c2f55bfaadb8d1604c7e50aa21b31e4010 Mon Sep 17 00:00:00 2001 From: Piotr Rygielski <114479+vikin91@users.noreply.github.com> Date: Fri, 30 Jan 2026 16:51:45 +0100 Subject: [PATCH 6/7] Make numbers easier to read by humans --- cmd/metrics-analyzer/main.go | 4 --- internal/evaluator/gauge.go | 38 +++++++++++++++----- internal/evaluator/histogram.go | 61 +++++++++++++++++++++++++++------ templates/markdown.tmpl | 4 +++ 4 files changed, 85 insertions(+), 22 deletions(-) diff --git a/cmd/metrics-analyzer/main.go b/cmd/metrics-analyzer/main.go index 52d9705..41ee791 100644 --- a/cmd/metrics-analyzer/main.go +++ b/cmd/metrics-analyzer/main.go @@ -210,10 +210,6 @@ func listRulesCommand() { } } -func extractClusterName(filename string) string { - return analyzer.ExtractClusterName(filename) -} - func printUsage() { fmt.Println("Usage: metrics-analyzer [options]") fmt.Println() diff --git a/internal/evaluator/gauge.go b/internal/evaluator/gauge.go index 15366d0..8ba7478 100644 --- a/internal/evaluator/gauge.go +++ b/internal/evaluator/gauge.go @@ -3,6 +3,7 @@ package evaluator import ( "fmt" "regexp" + "strconv" "strings" "time" @@ -36,17 +37,21 @@ func EvaluateGauge(rule rules.Rule, metrics parser.MetricsData, loadLevel rules. 
// Select thresholds based on load level thresholds := selectThresholds(rule, loadLevel) + extras := map[string]interface{}{ + "value_human": formatHumanNumberGauge(value), + } + // Evaluate thresholds if thresholds.HigherIsWorse { if value < thresholds.Low { result.Status = rules.StatusGreen - result.Message = interpolate(rule.Messages.Green, value, nil) + result.Message = interpolate(rule.Messages.Green, value, extras) } else if value < thresholds.High { result.Status = rules.StatusYellow - result.Message = interpolate(rule.Messages.Yellow, value, nil) + result.Message = interpolate(rule.Messages.Yellow, value, extras) } else { result.Status = rules.StatusRed - result.Message = interpolate(rule.Messages.Red, value, nil) + result.Message = interpolate(rule.Messages.Red, value, extras) } } else { // Lower is worse (inverted) - special case for zero checks @@ -54,22 +59,22 @@ func EvaluateGauge(rule rules.Rule, metrics parser.MetricsData, loadLevel rules. // Zero check: > 0 is good, == 0 is bad if value > 0 { result.Status = rules.StatusGreen - result.Message = interpolate(rule.Messages.Green, value, nil) + result.Message = interpolate(rule.Messages.Green, value, extras) } else { result.Status = rules.StatusRed - result.Message = interpolate(rule.Messages.Red, value, nil) + result.Message = interpolate(rule.Messages.Red, value, extras) } } else { // Normal inverted logic if value >= thresholds.High { result.Status = rules.StatusGreen - result.Message = interpolate(rule.Messages.Green, value, nil) + result.Message = interpolate(rule.Messages.Green, value, extras) } else if value >= thresholds.Low { result.Status = rules.StatusYellow - result.Message = interpolate(rule.Messages.Yellow, value, nil) + result.Message = interpolate(rule.Messages.Yellow, value, extras) } else { result.Status = rules.StatusRed - result.Message = interpolate(rule.Messages.Red, value, nil) + result.Message = interpolate(rule.Messages.Red, value, extras) } } } @@ -114,3 +119,20 @@ func 
interpolate(template string, value float64, extras map[string]interface{}) return result } + +func formatHumanNumberGauge(value float64) string { + raw := strconv.FormatFloat(value, 'f', 0, 64) + sign := "" + if strings.HasPrefix(raw, "-") { + sign = "-" + raw = strings.TrimPrefix(raw, "-") + } + var grouped strings.Builder + for i, r := range raw { + if i > 0 && (len(raw)-i)%3 == 0 { + grouped.WriteString(" ") + } + grouped.WriteRune(r) + } + return sign + grouped.String() +} diff --git a/internal/evaluator/histogram.go b/internal/evaluator/histogram.go index 5dbcad2..d43f637 100644 --- a/internal/evaluator/histogram.go +++ b/internal/evaluator/histogram.go @@ -2,6 +2,7 @@ package evaluator import ( "fmt" + "math" "sort" "strconv" "strings" @@ -47,28 +48,42 @@ func EvaluateHistogram(rule rules.Rule, metrics parser.MetricsData, loadLevel ru return result } - // Calculate P95 and P99 + // Calculate P50, P75, P95 and P99 + p50Threshold := totalCount * 0.50 + p75Threshold := totalCount * 0.75 p95Threshold := totalCount * 0.95 p99Threshold := totalCount * 0.99 - var p95, p99 float64 + var p50, p75, p95, p99 float64 for _, bucket := range buckets { + if p50 == 0 && bucket.Count >= p50Threshold { + p50 = bucket.Le + } + if p75 == 0 && bucket.Count >= p75Threshold { + p75 = bucket.Le + } if p95 == 0 && bucket.Count >= p95Threshold { p95 = bucket.Le } if p99 == 0 && bucket.Count >= p99Threshold { p99 = bucket.Le } - if p95 > 0 && p99 > 0 { + if p50 > 0 && p75 > 0 && p95 > 0 && p99 > 0 { break } } result.Value = p95 + unit := "" + if rule.HistogramConfig != nil { + unit = strings.TrimSpace(rule.HistogramConfig.Unit) + } result.Details = append(result.Details, - fmt.Sprintf("p95: %.3f", p95), - fmt.Sprintf("p99: %.3f", p99), - fmt.Sprintf("count: %.0f", totalCount), + fmt.Sprintf("p50: %s (i.e., 50%% of the observations are below this value)", formatHistogramValue(p50, unit)), + fmt.Sprintf("p75: %s (i.e., 75%% of the observations are below this value)", 
formatHistogramValue(p75, unit)), + fmt.Sprintf("p95: %s (i.e., 95%% of the observations are below this value)", formatHistogramValue(p95, unit)), + fmt.Sprintf("p99: %s (i.e., 99%% of the observations are below this value)", formatHistogramValue(p99, unit)), + fmt.Sprintf("count: %s", formatHumanInteger(totalCount)), ) // Select thresholds based on load level @@ -125,7 +140,7 @@ func evaluateSingleHistogramInfOverflow(baseName string, metrics parser.MetricsD Details: []string{}, Timestamp: time.Now(), } - result.ReviewStatus = "Automatically generated rule; review by the code author" + result.ReviewStatus = "Automatically generated rule; reviewed by the code author at the time of implementation." // Get histogram buckets bucketMetricName := baseName + "_bucket" @@ -220,8 +235,8 @@ func evaluateSingleHistogramInfOverflow(baseName string, metrics parser.MetricsD result.Details = append(result.Details, "Metric Description: "+bucketMetric.Help) } result.Details = append(result.Details, - "Total Number of Observations: "+formatHumanNumber(worstTotalCount)+" unit", - "Observations in +Inf bucket: "+formatHumanNumber(worstInfObservations)+" unit", + "Total Number of Observations: "+formatHumanNumber(worstTotalCount), + "Observations in +Inf bucket: "+formatHumanNumber(worstInfObservations), "Percentage of observations in +Inf bucket: "+formatHumanNumber(worstInfPercentage)+" %", "Highest non-infinity bucket: "+formatHumanNumber(worstHighestFiniteLe)+" unit", ) @@ -257,7 +272,15 @@ func getSeriesKey(labels map[string]string) string { } func formatHumanNumber(value float64) string { - raw := strconv.FormatFloat(value, 'f', 2, 64) + return formatHumanNumberWithPrecision(value, 2) +} + +func formatHumanInteger(value float64) string { + return formatHumanNumberWithPrecision(value, 0) +} + +func formatHumanNumberWithPrecision(value float64, precision int) string { + raw := strconv.FormatFloat(value, 'f', precision, 64) sign := "" if strings.HasPrefix(raw, "-") { sign = "-" @@ 
-281,3 +304,21 @@ func formatHumanNumber(value float64) string { } return sign + grouped.String() } + +func formatHistogramValue(value float64, unit string) string { + formatted := formatHumanNumber(value) + if value == math.Trunc(value) { + formatted = formatHumanInteger(value) + } + unit = strings.TrimSpace(strings.ToLower(unit)) + if unit == "" { + return formatted + } + switch unit { + case "milliseconds", "millisecond", "ms": + return formatted + " ms" + case "seconds", "second", "s": + return formatted + " s" + } + return formatted + " " + unit +} diff --git a/templates/markdown.tmpl b/templates/markdown.tmpl index ab9f7f2..4a1a7fa 100644 --- a/templates/markdown.tmpl +++ b/templates/markdown.tmpl @@ -25,6 +25,8 @@ {{ $r.Message }} {{ if $r.ReviewStatus }} #### Review status +This alert was generated by evaluating a rule. That rule was reviewed by: + {{ $r.ReviewStatus }} {{ end }} {{ if gt (len $r.Details) 0 }} @@ -60,6 +62,8 @@ {{ $r.Message }} {{ if $r.ReviewStatus }} #### Review status +This warning was generated by evaluating a rule. That rule was reviewed by: + {{ $r.ReviewStatus }} {{ end }} {{ if gt (len $r.Details) 0 }} From e84b2b2301b5b9948203d819eab0e02e8dec4250 Mon Sep 17 00:00:00 2001 From: Piotr Rygielski <114479+vikin91@users.noreply.github.com> Date: Fri, 30 Jan 2026 16:52:04 +0100 Subject: [PATCH 7/7] Bump version to 0.0.3 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 4e379d2..bcab45a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.0.2 +0.0.3