Skip to content

Commit c440bd2

Browse files
authored
Alerting: Change default for max_attempts to 3. (grafana#97461)
Currently the default is 1. This means that, by default, users will see transient query errors reflected as alert evaluation failures, when often an immediate retry is sufficient to evaluate the rule successfully. Enabling retries by default leads to a better experience out of the box.
1 parent 6a1685a commit c440bd2

File tree

4 files changed

+9
-9
lines changed

4 files changed

+9
-9
lines changed

conf/defaults.ini

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1338,8 +1338,8 @@ execute_alerts = true
13381338
# The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
13391339
evaluation_timeout = 30s
13401340

1341-
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 1.
1342-
max_attempts = 1
1341+
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 3.
1342+
max_attempts = 3
13431343

13441344
# Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not a multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time.
13451345
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

conf/sample.ini

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1322,8 +1322,8 @@
13221322
# The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
13231323
;evaluation_timeout = 30s
13241324

1325-
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 1.
1326-
;max_attempts = 1
1325+
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 3.
1326+
;max_attempts = 3
13271327

13281328
# Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not a multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time.
13291329
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

pkg/setting/setting_unified_alerting.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ const (
4949
evaluatorDefaultEvaluationTimeout = 30 * time.Second
5050
schedulerDefaultAdminConfigPollInterval = time.Minute
5151
schedulerDefaultExecuteAlerts = true
52-
schedulerDefaultMaxAttempts = 1
52+
schedulerDefaultMaxAttempts = 3
5353
schedulerDefaultLegacyMinInterval = 1
5454
screenshotsDefaultCapture = false
5555
screenshotsDefaultCaptureTimeout = 10 * time.Second

pkg/setting/setting_unified_alerting_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,14 +120,14 @@ func TestUnifiedAlertingSettings(t *testing.T) {
120120
"evaluation_timeout": evaluatorDefaultEvaluationTimeout.String(),
121121
},
122122
alertingOptions: map[string]string{
123-
"max_attempts": "1",
123+
"max_attempts": "1", // Note: Ignored, setting does not exist.
124124
"min_interval_seconds": "120",
125125
"execute_alerts": "true",
126126
"evaluation_timeout_seconds": "160",
127127
},
128128
verifyCfg: func(t *testing.T, cfg Cfg) {
129129
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.AdminConfigPollInterval)
130-
require.Equal(t, int64(1), cfg.UnifiedAlerting.MaxAttempts)
130+
require.Equal(t, int64(3), cfg.UnifiedAlerting.MaxAttempts)
131131
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval)
132132
require.Equal(t, true, cfg.UnifiedAlerting.ExecuteAlerts)
133133
require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)
@@ -168,14 +168,14 @@ func TestUnifiedAlertingSettings(t *testing.T) {
168168
"evaluation_timeout": "invalid",
169169
},
170170
alertingOptions: map[string]string{
171-
"max_attempts": "1",
171+
"max_attempts": "1", // Note: Ignored, setting does not exist.
172172
"min_interval_seconds": "120",
173173
"execute_alerts": "false",
174174
"evaluation_timeout_seconds": "160",
175175
},
176176
verifyCfg: func(t *testing.T, cfg Cfg) {
177177
require.Equal(t, alertmanagerDefaultConfigPollInterval, cfg.UnifiedAlerting.AdminConfigPollInterval)
178-
require.Equal(t, int64(1), cfg.UnifiedAlerting.MaxAttempts)
178+
require.Equal(t, int64(3), cfg.UnifiedAlerting.MaxAttempts)
179179
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval)
180180
require.Equal(t, false, cfg.UnifiedAlerting.ExecuteAlerts)
181181
require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)

0 commit comments

Comments
 (0)