282 changes: 282 additions & 0 deletions monitor/alert-rule.yaml
@@ -0,0 +1,282 @@
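# Prometheus alerting rules for a Pulsar cluster. The serverFiles /
# alerting_rules.yml nesting below follows the prometheus Helm chart's
# values.yaml layout (assumed deployment target); each rule group covers
# one Pulsar component.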
serverFiles:
  alerting_rules.yml:
    groups:
      - name: broker
        rules:
          - alert: BrokerPodNotHealthy
            annotations:
              summary: Kubernetes Pod not healthy.
            expr: kube_pod_status_ready{condition="true", pod=~".*-broker-\\d+"} == 0
            for: 10m
            labels:
              severity: error
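          # OOMKill detection pattern: the restart counter must have grown
          # within the last 15m AND the container's last termination reason
          # over that same window must be OOMKilled. The two rules below
          # share this expression and differ only in the restart threshold
          # and severity; the same pattern recurs in the bookie, zookeeper,
          # and proxy groups.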
          - alert: BrokerContainerOOMKilled
            annotations:
              summary: Kubernetes container OOM killed.
            expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 15m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled", pod=~".*-(broker)-.*"}[15m]) == 1
            for: 0m
            labels:
              severity: warning
          - alert: BrokerContainerOOMKilled
            annotations:
              summary: Kubernetes container OOM killed.
            expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 15m >= 2) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled", pod=~".*-(broker)-.*"}[15m]) == 1
            for: 0m
            labels:
              severity: error
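          # pulsar_broker_publish_latency is assumed to be reported in
          # milliseconds, so dividing by 1000 converts the P99 value to
          # seconds before comparing against the 1s threshold.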
          - alert: BrokerHighPublishLatency
            annotations:
              summary: Pulsar Broker P99 publish latency is over 1 second.
            expr: sum(pulsar_broker_publish_latency{quantile="0.99"}) by (pod, cloud_streamnative_io_cluster) / 1000 > 1
            for: 15m
            labels:
              severity: error
          - alert: BrokerHighPublishLatency
            annotations:
              summary: Pulsar Broker P99 publish latency is over 1 second.
            expr: sum(pulsar_broker_publish_latency{quantile="0.99"}) by (pod, cloud_streamnative_io_cluster) / 1000 > 1
            for: 5m
            labels:
              severity: warning
Comment on lines +34 to +40

we can remove this one; it duplicates the one above. The user side should determine how long the pending time is.

          - alert: SizeableCompactedLedger
            annotations:
              summary: There is a topic with a sizeable compacted ledger of more than 10 million entries (remove entries_count from the topic name in Grafana).
            expr: sum(pulsar_compaction_compacted_entries_count{}) by (pod, cloud_streamnative_io_cluster) > 10000000
            for: 5m
            labels:
              severity: error
Comment on lines +41 to +47

we can remove this alert rule; it's a common scenario to be monitored.

          - alert: TooManyReplicationBacklogs
            annotations:
              summary: There is a topic with too large a replication backlog.
            expr: max(pulsar_replication_backlog{}) by (cluster,topic,namespace,pod) > 50000
            for: 5m
            labels:
              severity: error
          - alert: BrokerHighTopicLoadPendingRequests
            annotations:
              summary: High number of topics pending in topic load requests.
            expr: sum(pulsar_broker_topic_load_pending_requests{}) by (pod, cloud_streamnative_io_cluster) > 900
            for: 10m
            labels:
              severity: error
          - alert: BrokerHighLookUpPendingRequests
            annotations:
              summary: High number of topics pending in lookup requests.
            expr: sum(pulsar_broker_lookup_pending_requests{}) by (pod, cloud_streamnative_io_cluster) > 200000
            for: 10m
            labels:
              severity: error
          - alert: BrokerHighAuthFailures
            annotations:
              summary: Too many authentication failures.
            expr: sum by (pod,cluster)(increase(pulsar_authentication_failures_count{}[1m])) > 100
            for: 5m
            labels:
              severity: warning
          - alert: PulsarTooManyBacklogs
            annotations:
              summary: Too many backlog messages.
            expr: sum(pulsar_msg_backlog{}) by (pod, cloud_streamnative_io_cluster) > 10000
            for: 10m
            labels:
              severity: error
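          # topic_load_times is assumed to be in milliseconds; dividing by
          # 1000 makes the threshold below a P99 topic load time of 30s.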
          - alert: PulsarTopicLoadLatencyP99
            annotations:
              summary: Topic load latency P99 too high.
            expr: sum by(cluster,pod)(topic_load_times{quantile="0.99"}) / 1000 > 30
            for: 5m
            labels:
              severity: error
          - alert: PulsarTopicLoadFailed
            annotations:
              summary: Topic failed to load too many times.
            expr: sum by(pod, cloud_streamnative_io_cluster)(delta(topic_load_failed_total[30m])) > 30
            for: 5m
            labels:
              severity: error
      - name: bookie
        rules:
          - alert: BookieContainerOOMKilled
            annotations:
              summary: Kubernetes container OOM killed.
            expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 15m >= 2) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled", pod=~".*-(bookie|bk)-.*"}[15m]) == 1
            for: 0m
            labels:
              severity: error
          - alert: BookieContainerOOMKilled
            annotations:
              summary: Kubernetes container OOM killed.
            expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 15m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled", pod=~".*-(bookie|bk)-.*"}[15m]) == 1
            for: 0m
            labels:
              severity: warning
          - alert: BookiePodNotHealthy
            annotations:
              summary: Kubernetes Pod not healthy.
            expr: kube_pod_status_ready{condition="true", pod=~".*-(bookie|bk)-.*"} == 0 AND ON (namespace,pod) kube_pod_info{created_by_kind!~"(Job|<none>)"}
            for: 10m
            labels:
              severity: error
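          # The info-level variant below assumes decommission Jobs are named
          # <pod>-decommission; label_replace() recovers the pod name from
          # the job name so the alert is downgraded while that bookie is
          # intentionally being decommissioned.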
          - alert: BookiePodNotHealthy
            annotations:
              summary: Kubernetes Pod not healthy.
            expr: sum by(mon_poolmember,namespace,pod,job) (label_replace(kube_job_labels{},"pod","$1","job_name","(.*)-decommission") == 1) AND ON (namespace,pod) kube_pod_status_ready{condition="true", pod=~".*-(bookie|bk)-.*"} == 0 AND ON (namespace,pod) kube_pod_info{created_by_kind!~"(Job|<none>)"}
            for: 2m
            labels:
              severity: info
Comment on lines +120 to +126

we can remove this one.

          - alert: BookieDecommissionJobCompletion
            annotations:
              summary: Job did not complete in time.
            expr: sum (kube_job_spec_completions{job_name=~".*-decommission"} - kube_job_status_succeeded{job="kube-state-metrics", job_name=~".*-decommission"}) by (mon_poolmember,job,job_name,namespace) > 0
            for: 1h
            labels:
              severity: error
          - alert: BookiePVCUsage
            annotations:
              summary: High PVC usage.
            expr: 100 * (1 - (kubelet_volume_stats_available_bytes{persistentvolumeclaim=~".*-(bk|bookie)-.*"} / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*-(bk|bookie)-.*"})) > 85
            for: 5m
            labels:
              severity: warning
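          # predict_linear() extrapolates the 6h trend of the (assumed)
          # recording rule pulsarcluster:bookie_ledger_usage:sum 8 hours
          # into the future; the rule fires if usage is projected above 95%.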
          - alert: BookieLedgerFillingUp
            annotations:
              summary: PulsarCluster bookie ledger is predicted to fill up.
            expr: predict_linear(pulsarcluster:bookie_ledger_usage:sum[6h], 8 * 3600) > 95
            for: 0m
            labels:
              severity: warning
          - alert: BookieTotalLedgerUsage
            annotations:
              summary: Total PVC usage of bookie ledger is high.
            expr: sum by (pod, cloud_streamnative_io_cluster)(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_0_usage) / count by (pod, cloud_streamnative_io_cluster)(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_0_usage) > 85
            labels:
              priority: P2
              severity: error
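          # The sum/count pair above averages ledger-dir usage per pod; the
          # rule below instead counts how many bookies are at or below 75%
          # usage and fires when fewer than 2 still have that headroom.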
          - alert: BookieTotalLedgerUsage
            annotations:
              summary: Fewer than 2 bookies have ledger space usage below 75%.
            expr: (count by (pod, cloud_streamnative_io_cluster)(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_0_usage) - count by (pod, cloud_streamnative_io_cluster)(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_0_usage > 75)) < 2
            for: 5m
            labels:
              priority: P3
              severity: error
          - alert: BookieHighEntryAddLatency
            annotations:
              summary: Bookie entry add P99 latency is over 500ms. Possible bottleneck in IO or BW of the disk.
            expr: bookkeeper_server_ADD_ENTRY_REQUEST{quantile="0.99", success="true"} > 500
            for: 5m
            labels:
              priority: P3
              severity: error
          - alert: BookieHighEntryReadLatency
            annotations:
              summary: Bookie entry read P99 latency is over 10s.
            expr: avg_over_time(bookkeeper_server_READ_ENTRY_REQUEST{quantile="0.99", success="true"}[10m]) / 1000 > 10
            for: 5m
            labels:
              priority: P3
              severity: error
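          # bookie_SERVER_STATUS is assumed to report 1 for a writable
          # bookie and 0 for a read-only one; the next three rules watch
          # how many writable instances remain.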
          - alert: BookieInstanceWritable
            annotations:
              summary: Fewer than 3 writable bookie instances.
            expr: count by(pod, cloud_streamnative_io_cluster)(bookie_SERVER_STATUS == 1) < 3
            for: 2m
            labels:
              priority: P1
              severity: critical
          - alert: BookieInstanceReadonly
            annotations:
              summary: Bookie instance in read-only status.
            expr: bookie_SERVER_STATUS == 0
            for: 5m
            labels:
              severity: warning
          - alert: BookieNotEnoughWritableInstance
            annotations:
              summary: Fewer than 3 writable bookies.
            expr: (count by (pod, cloud_streamnative_io_cluster)(bookie_SERVER_STATUS) - count by (pod, cloud_streamnative_io_cluster)(bookie_SERVER_STATUS != 1)) < 3
            for: 5m
            labels:
              priority: P2
              severity: error
      - name: zookeeper
        rules:
          - alert: ZKContainerOOMKilled
            annotations:
              summary: Kubernetes container OOM killed.
            expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 15m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled", pod=~".*-(zookeeper|zk)-.*"}[15m]) == 1
            for: 0m
            labels:
              severity: critical
          - alert: ZKPodNotHealthy
            annotations:
              summary: Kubernetes Pod not healthy.
            expr: kube_pod_status_ready{condition="true", pod=~".*-(zookeeper|zk)-.*"} == 0 AND ON (pod) kube_pod_info{created_by_kind!~"(Job|<none>)"}
            for: 10m
            labels:
              severity: critical
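          # znode_count tracks the total number of znodes on the server,
          # not the watcher count, despite this alert's name.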
          - alert: ZKHighWatchers
            annotations:
              summary: Zookeeper server znode count is over 1.2 million.
            expr: znode_count{} > 1200000
            for: 5m
            labels:
              severity: warning
          - alert: ZKLeaderLost
            annotations:
              summary: Zookeeper cluster leader lost.
            expr: count by(cloud_streamnative_io_cluster,pod)(leader_uptime) != 1
            for: 5m
            labels:
              severity: critical
          - alert: ZKDataInconsistent
            annotations:
              summary: Znode count inconsistent.
            expr: sum by (cloud_streamnative_io_cluster,pod)(increase(diff_count[1m])) > 10
            for: 2m
            labels:
              severity: error
          - alert: ZKReadLatency
            annotations:
              summary: ZK P99 ReadLatency too high.
            expr: readlatency{quantile="0.99"} > 500
            for: 5m
            labels:
              severity: warning
          - alert: ZKUpdateLatency
            annotations:
              summary: ZK P99 UpdateLatency too high.
            expr: updatelatency{quantile="0.99"} > 1000
            for: 5m
            labels:
              severity: warning
          - alert: ZKPVCUsage
            annotations:
              summary: High PVC usage.
            expr: 100 * (1 - (kubelet_volume_stats_available_bytes{persistentvolumeclaim=~".*-(zk|zookeeper)-.*"} / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*-(zk|zookeeper)-.*"})) > 85
            for: 5m
            labels:
              severity: error
          - alert: ZKUnrecoverableError
            annotations:
              summary: ZK in an unexpected state related to incident 846 or 849. Needs a restart of ZK. Dashboard needs correction.
            expr: unrecoverable_error_count > 0
            for: 5m
            labels:
              severity: critical
      - name: proxy
        rules:
          - alert: ProxyHighActiveConnections
            annotations:
              summary: The proxies are close to reaching the maximum number of connections.
            expr: sum(pulsar_proxy_active_connections{}) by (pod, cloud_streamnative_io_cluster) > 60000
            for: 10m
            labels:
              severity: error
          - alert: ProxyContainerOOMKilled
            annotations:
              summary: Kubernetes container OOM killed.
            expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 15m >= 2) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled", pod=~".*-(proxy)-.*"}[15m]) == 1
            for: 0m
            labels:
              severity: error
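# A quick way to validate the rule syntax before deploying (assuming the
# groups are first extracted into a standalone rule file, since promtool
# does not understand the serverFiles wrapper) is:
#   promtool check rules alerting_rules.yml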