Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.2
0.0.3
19 changes: 16 additions & 3 deletions automated-rules/cluster_entities_relationship.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
rule_type = "composite"
display_name = "cluster_entities_relationship"
description = "Validate cluster entities relationships"
description = """
Validate the relationship between the number of containers, their endpoints (ports), and IP addresses in the cluster.
"""

reviewed = "Partially, by a human"
last_review_by = "Piotr"
last_review_on = "2026-01-30"

[composite_config]

Expand Down Expand Up @@ -28,15 +34,22 @@ numerator = "endpoints"
denominator = "containers"
min_ratio = 0.5
status = "yellow"
message = "Entities: {containers} containers, {endpoints} endpoints, {ips} IPs - Low endpoint ratio"
message = """
Entities: {containers} containers, {endpoints} endpoints, {ips} IPs - Low endpoint-to-container ratio.
This roughly means that more than half of the containers have 0 endpoints (ports).
This is unusual, but possible.
"""

[[composite_config.checks]]
check_type = "ratio"
numerator = "ips"
denominator = "endpoints"
min_ratio = 0.5
status = "yellow"
message = "Entities: {containers} containers, {endpoints} endpoints, {ips} IPs - Low IP ratio"
message = """
Entities: {containers} containers, {endpoints} endpoints, {ips} IPs - Low IP-to-endpoint ratio.
We expect more unique IP addresses for this number of containers and endpoints.
"""

[messages]
green = "Entities: {containers} containers, {endpoints} endpoints, {ips} IPs (healthy ratios)"
Expand Down
3 changes: 3 additions & 0 deletions automated-rules/cpu_throttling.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
rule_type = "percentage"
display_name = "cpu_throttling"
description = "CPU throttling percentage"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[percentage_config]
numerator = "rox_sensor_process_cpu_nr_throttled"
Expand Down
3 changes: 3 additions & 0 deletions automated-rules/dedupe_cache_hit_rate.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
rule_type = "cache_hit_rate"
display_name = "dedupe_cache_hit_rate"
description = "Deduplication cache effectiveness"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[cache_config]
hits_metric = "rox_sensor_dedupe_cache_hits"
Expand Down
3 changes: 3 additions & 0 deletions automated-rules/file_descriptors.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
rule_type = "percentage"
display_name = "file_descriptors"
description = "File descriptor utilization"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[percentage_config]
numerator = "process_open_fds"
Expand Down
3 changes: 3 additions & 0 deletions automated-rules/go_goroutines.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ rule_type = "gauge_threshold"
metric_name = "go_goroutines"
display_name = "go_goroutines"
description = "Number of goroutines"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[thresholds]
low = 10000
Expand Down
9 changes: 6 additions & 3 deletions automated-rules/go_memstats_heap_objects.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,19 @@ rule_type = "gauge_threshold"
metric_name = "go_memstats_heap_objects"
display_name = "go_memstats_heap_objects"
description = "Number of heap objects"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[thresholds]
low = 1000000
high = 10000000
higher_is_worse = true

[messages]
green = "{value:.0f} heap objects (healthy)"
yellow = "{value:.0f} heap objects (elevated - monitor memory)"
red = "{value:.0f} heap objects (very high - significant memory use)"
green = "{value_human} heap objects (healthy)"
yellow = "{value_human} heap objects (elevated - monitor memory)"
red = "{value_human} heap objects (very high - significant memory use)"

[remediation]
red = "Very high heap object count. Check for memory leaks. Review object allocation patterns."
Expand Down
3 changes: 3 additions & 0 deletions automated-rules/go_threads.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ rule_type = "gauge_threshold"
metric_name = "go_threads"
display_name = "go_threads"
description = "Number of OS threads"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[thresholds]
low = 100
Expand Down
3 changes: 3 additions & 0 deletions automated-rules/heap_utilization.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
rule_type = "percentage"
display_name = "heap_utilization"
description = "Heap memory utilization"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[percentage_config]
numerator = "go_memstats_heap_alloc_bytes"
Expand Down
3 changes: 3 additions & 0 deletions automated-rules/http_incoming_in_flight_requests.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ rule_type = "gauge_threshold"
metric_name = "http_incoming_in_flight_requests"
display_name = "http_incoming_in_flight_requests"
description = "HTTP in-flight requests"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[thresholds]
low = 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ rule_type = "histogram"
metric_name = "http_incoming_request_duration_histogram_seconds"
display_name = "http_incoming_request_duration_histogram_seconds"
description = "HTTP request latency distribution"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[histogram_config]
unit = "seconds"
Expand Down
3 changes: 3 additions & 0 deletions automated-rules/load-level/cluster_volume.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@

rule_type = "load_detection"
display_name = "cluster_volume"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[[metrics]]
name = "containers"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ rule_type = "histogram"
metric_name = "rox_sensor_component_process_message_duration_seconds"
display_name = "rox_sensor_component_process_message_duration_seconds"
description = "Component process message duration"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[histogram_config]
unit = "seconds"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
rule_type = "gauge_threshold"
metric_name = "rox_sensor_deployment_enhancement_queue_size"
display_name = "rox_sensor_deployment_enhancement_queue_size"
description = "Deployment enhancement queue size"
description = """
Number of deployments waiting for enhancement.
"""

reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[thresholds]
low = 0
low = 1
high = 2
higher_is_worse = true

Expand All @@ -17,5 +23,5 @@ red = "{value:.0f} items queued (backed up - processing issues)"
red = "Deployment enhancement queue backed up. Check processing capacity and bottlenecks."
yellow = "Monitor queue size. Investigate if backlog persists."

acs_versions = ["4.7+", "4.8+", "4.9+"]
acs_versions = ["4.7+", "4.8+", "4.9+", "4.10+"]

Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ rule_type = "queue_operations"
metric_name = "rox_sensor_detector_deployment_queue_operations_total"
display_name = "rox_sensor_detector_deployment_queue_operations_total"
description = "Deployment queue Add/Remove balance"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[queue_config]
operation_label = "Operation"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ rule_type = "queue_operations"
metric_name = "rox_sensor_detector_network_flow_queue_operations_total"
display_name = "rox_sensor_detector_network_flow_queue_operations_total"
description = "Network flow queue Add/Remove balance"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[queue_config]
operation_label = "Operation"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ rule_type = "queue_operations"
metric_name = "rox_sensor_detector_process_indicator_queue_operations_total"
display_name = "rox_sensor_detector_process_indicator_queue_operations_total"
description = "Process indicator queue Add/Remove balance"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[queue_config]
operation_label = "Operation"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
rule_type = "histogram"
metric_name = "rox_sensor_enricher_image_scan_internal_exponential_backoff_seconds"
display_name = "rox_sensor_enricher_image_scan_internal_exponential_backoff_seconds"
description = "Image scan exponential backoff duration"
description = "Image scan exponential backoff duration. A high value means the scanner is overloaded and image scans are being delayed."
reviewed = "Yes, human-reviewed"
last_review_by = "Piotr"
last_review_on = "2026-01-30"

[histogram_config]
unit = "seconds"
Expand All @@ -12,12 +15,21 @@ p95_warn = 4.0

[messages]
green = "p95={p95:.3f}s, p99={p99:.3f}s (good)"
yellow = "p95={p95:.3f}s, p99={p99:.3f}s (elevated)"
red = "p95={p95:.3f}s, p99={p99:.3f}s (high backoff - scanner issues)"
yellow = """
p95={p95:.3f}s, p99={p99:.3f}s (elevated).
A high value means the system is spending a lot of time waiting before it can scan images, not actually doing work.
That waiting happens only when the scanner is too busy or rate-limited, so a high number means requests are piling up and scans are delayed.
This delay applies to each scan request sent to the scanner.
"""
red = """
p95={p95:.3f}s, p99={p99:.3f}s (high backoff - scanner issues).
A high value means the system is spending a lot of time waiting before it can scan images, not actually doing work.
That waiting happens only when the scanner is too busy or rate-limited, so a high number means requests are piling up and scans are delayed.
This delay applies to each scan request sent to the scanner.
"""

[remediation]
red = "High image scan backoff duration. Check scanner connectivity and performance."
yellow = "Monitor image scan backoff trends."

acs_versions = ["4.7+", "4.8+", "4.9+"]
red = "High image scan backoff duration. Check scanner connectivity and performance. Also check the `rox_sensor_scan_call_duration_milliseconds` metric."
yellow = "Monitor image scan backoff trends. Also check the `rox_sensor_scan_call_duration_milliseconds` metric."

acs_versions = ["4.7+", "4.8+", "4.9+", "4.10+"]
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ rule_type = "histogram"
metric_name = "rox_sensor_k8s_event_ingestion_to_send_duration"
display_name = "rox_sensor_k8s_event_ingestion_to_send_duration"
description = "K8s event ingestion to send duration"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[histogram_config]
unit = "seconds"
Expand Down
17 changes: 13 additions & 4 deletions automated-rules/rox_sensor_k8s_event_processing_duration.toml
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
rule_type = "histogram"
metric_name = "rox_sensor_k8s_event_processing_duration"
display_name = "rox_sensor_k8s_event_processing_duration"
description = "K8s event processing duration"
description = """
K8s event processing duration in milliseconds.
It measures how long the listener’s dispatcher takes to process a single Kubernetes event,
from right before ProcessEvent starts until right after it finishes and the metric is recorded.
It does not include resolver, detector, or sending to Central.
"""

reviewed = "Partially, by a human"
last_review_by = "Piotr"
last_review_on = "2026-01-30"

[histogram_config]
unit = "seconds"
unit = "ms"

[thresholds]
p95_good = 0.1
p95_warn = 1.0
p95_good = 256
p95_warn = 512

[messages]
green = "p95={p95:.3f}s, p99={p99:.3f}s (good)"
Expand Down
6 changes: 6 additions & 0 deletions automated-rules/rox_sensor_network_flow_buffer_size.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ rule_type = "gauge_threshold"
metric_name = "rox_sensor_network_flow_buffer_size"
display_name = "rox_sensor_network_flow_buffer_size"
description = "Network flow buffer size monitoring"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[thresholds]
low = 100
Expand All @@ -11,14 +14,17 @@ higher_is_worse = true
[load_level_thresholds.low]
low = 100
high = 150
higher_is_worse = true

[load_level_thresholds.medium]
low = 150
high = 250
higher_is_worse = true

[load_level_thresholds.high]
low = 200
high = 300
higher_is_worse = true

[messages]
green = "Buffer size: {value} (default: 100, max: 300)"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ rule_type = "histogram"
metric_name = "rox_sensor_network_flow_manager_purger_duration_seconds"
display_name = "rox_sensor_network_flow_manager_purger_duration_seconds"
description = "Network flow manager purger duration"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[histogram_config]
unit = "seconds"
Expand Down
3 changes: 3 additions & 0 deletions automated-rules/rox_sensor_num_pods_in_store.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ rule_type = "gauge_threshold"
metric_name = "rox_sensor_num_pods_in_store"
display_name = "rox_sensor_num_pods_in_store"
description = "Number of pods tracked by Sensor"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[thresholds]
low = 0
Expand Down
3 changes: 3 additions & 0 deletions automated-rules/rox_sensor_output_channel_size.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ rule_type = "gauge_threshold"
metric_name = "rox_sensor_output_channel_size"
display_name = "rox_sensor_output_channel_size"
description = "Output channel size (messages to Central)"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[thresholds]
low = 50
Expand Down
3 changes: 3 additions & 0 deletions automated-rules/rox_sensor_process_signal_buffer_size.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ rule_type = "gauge_threshold"
metric_name = "rox_sensor_process_signal_buffer_size"
display_name = "rox_sensor_process_signal_buffer_size"
description = "Process signal buffer size"
reviewed = "No, AI-generated"
last_review_by = ""
last_review_on = "never"

[thresholds]
low = 100
Expand Down
Loading
Loading