Skip to content

Commit 62d3d55

Browse files
author
Sergey Tolmachev
committed
Merge remote-tracking branch 'origin/control_unknown_untouched_decision' into 0.8.0-30.bosun
2 parents c683501 + b37763b commit 62d3d55

File tree

8 files changed

+621
-553
lines changed

8 files changed

+621
-553
lines changed

cmd/bosun/conf/conf.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ type SystemConfProvider interface {
7878

7979
GetMaxRenderedTemplateAge() int
8080

81+
GetProblemRunsToUnknown() time.Duration
82+
8183
GetExampleExpression() string
8284

8385
// Contexts

cmd/bosun/conf/system.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ type SystemConf struct {
4646
MinGroupSize int
4747

4848
UnknownThreshold int
49+
ProblemRunsToUnknown int
4950
CheckFrequency Duration // Time between alert checks: 5m
5051
DefaultRunEvery int // Default number of check intervals to run each alert: 1
5152
AlertCheckDistribution string // Method to distribute alert checks. No distribution if equals ""
@@ -818,6 +819,15 @@ func (sc *SystemConf) GetAzureMonitorContext() expr.AzureMonitorClients {
818819
return allClients
819820
}
820821

822+
// how many runs can by skipped because the errors of some stuck within the bosun
823+
// before we assume the alert should go to the unknown state
824+
func (sc *SystemConf) GetProblemRunsToUnknown() time.Duration {
825+
if sc.ProblemRunsToUnknown <= 0 {
826+
sc.ProblemRunsToUnknown = 1
827+
}
828+
return time.Duration(1 + sc.ProblemRunsToUnknown)
829+
}
830+
821831
// azureLogRequest outputs HTTP requests to Azure to the logs
822832
func azureLogRequest() autorest.PrepareDecorator {
823833
return func(p autorest.Preparer) autorest.Preparer {

cmd/bosun/sched/check.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -569,7 +569,11 @@ func (s *Schedule) findUnknownAlerts(now time.Time, alert string) []models.Alert
569569

570570
t := a.Unknown
571571
if t == 0 {
572-
t = s.SystemConf.GetCheckFrequency() * 2 * time.Duration(runEvery)
572+
runEvery := s.SystemConf.GetDefaultRunEvery()
573+
if a.RunEvery != 0 {
574+
runEvery = a.RunEvery
575+
}
576+
t = s.SystemConf.GetCheckFrequency() * time.Duration(runEvery) * s.SystemConf.GetProblemRunsToUnknown()
573577
}
574578
maxTouched := now.UTC().Unix() - int64(t.Seconds())
575579
untouched, err := s.DataAccess.State().GetUntouchedSince(alert, maxTouched)

cmd/bosun/sched/check_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,9 @@ Loop:
458458
}
459459
}
460460

461+
// TestCheckNotifyDelayedUnknown is a placeholder for a test of unknown
// notifications when the unknown transition is delayed by the
// ProblemRunsToUnknown setting. It is skipped explicitly so that the test
// run reports it as not implemented instead of as a vacuous pass.
func TestCheckNotifyDelayedUnknown(t *testing.T) {
	t.Skip("TODO: not implemented yet")
}
463+
461464
// TestCheckNotifyUnknownDefault tests the default unknownTemplate.
462465
func TestCheckNotifyUnknownDefault(t *testing.T) {
463466
defer setup()()

cmd/bosun/sched/depends_test.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import (
1212
// Result should be {a=c} only.
1313
func TestDependency_Simple(t *testing.T) {
1414
defer setup()()
15-
testSched(t, &schedTest{
15+
testSched(t, nil, &schedTest{
1616
conf: `alert a {
1717
crit = avg(q("avg:c{a=*}", "5m", "")) > 0
1818
depends = avg(q("avg:d{a=*}", "5m", "")) > 0
@@ -52,7 +52,7 @@ func TestDependency_Simple(t *testing.T) {
5252
// Crit and depends don't have same tag sets.
5353
func TestDependency_Overlap(t *testing.T) {
5454
defer setup()()
55-
testSched(t, &schedTest{
55+
testSched(t, nil, &schedTest{
5656
conf: `alert a {
5757
crit = avg(q("avg:c{a=*,b=*}", "5m", "")) > 0
5858
depends = avg(q("avg:d{a=*,d=*}", "5m", "")) > 0
@@ -91,7 +91,7 @@ func TestDependency_Overlap(t *testing.T) {
9191

9292
func TestDependency_OtherAlert(t *testing.T) {
9393
defer setup()()
94-
testSched(t, &schedTest{
94+
testSched(t, nil, &schedTest{
9595
conf: `alert a {
9696
crit = avg(q("avg:a{host=*,cpu=*}", "5m", "")) > 0
9797
}
@@ -134,7 +134,7 @@ func TestDependency_OtherAlert(t *testing.T) {
134134
func TestDependency_OtherAlert_Unknown(t *testing.T) {
135135
defer setup()()
136136

137-
testSched(t, &schedTest{
137+
testSched(t, nil, &schedTest{
138138
conf: `alert a {
139139
warn = avg(q("avg:a{host=*}", "5m", "")) > 0
140140
}
@@ -182,7 +182,7 @@ func TestDependency_OtherAlert_UnknownChain(t *testing.T) {
182182
bb := models.AlertKey("b{host=b}")
183183
cb := models.AlertKey("c{host=b}")
184184

185-
s := testSched(t, &schedTest{
185+
s := testSched(t, nil, &schedTest{
186186
conf: `
187187
alert a {
188188
warn = avg(q("avg:a{host=*}", "5m", "")) && 0
@@ -240,7 +240,7 @@ func TestDependency_OtherAlert_UnknownChain(t *testing.T) {
240240

241241
func TestDependency_Blocks_Unknown(t *testing.T) {
242242
defer setup()()
243-
testSched(t, &schedTest{
243+
testSched(t, nil, &schedTest{
244244
conf: `alert a {
245245
depends = avg(q("avg:b{host=*}", "5m", "")) > 0
246246
warn = avg(q("avg:a{host=*}", "5m", "")) > 0
@@ -267,7 +267,7 @@ func TestDependency_Blocks_Unknown(t *testing.T) {
267267
func TestDependency_AlertFunctionHasNoResults(t *testing.T) {
268268
defer setup()()
269269

270-
testSched(t, &schedTest{
270+
testSched(t, nil, &schedTest{
271271
conf: `
272272
alert a {
273273
warn = max(rename(q("sum:bosun.ping.timeout{dst_host=*,host=*}", "5m", ""), "host=source,dst_host=host"))

cmd/bosun/sched/sched_test.go

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ func initSched(sc conf.SystemConfProvider, c conf.RuleConfProvider, startTime ti
127127
return s, err
128128
}
129129

130-
func testSched(t *testing.T, st *schedTest, cluster cluster.Cluster) (s *Schedule) {
130+
func testSched(t *testing.T, sysConf *conf.SystemConf, st *schedTest, cluster cluster.Cluster) (s *Schedule) {
131131
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
132132
var req opentsdb.Request
133133
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
@@ -164,8 +164,12 @@ func testSched(t *testing.T, st *schedTest, cluster cluster.Cluster) (s *Schedul
164164
}
165165

166166
time.Sleep(time.Millisecond * 250)
167-
sysConf := &conf.SystemConf{CheckFrequency: conf.Duration{Duration: time.Minute * 5}, DefaultRunEvery: 1, UnknownThreshold: 5, MinGroupSize: 5, OpenTSDBConf: conf.OpenTSDBConf{Host: u.Host, ResponseLimit: 1 << 20}}
168-
s, _ = initSched(sysConf, c, time.Date(1900, 0, 0, 0, 0, 0, 0, time.UTC), cluster) //pretend we've been running for a while
167+
if sysConf == nil {
168+
sysConf = &conf.SystemConf{CheckFrequency: conf.Duration{Duration: time.Minute * 5}, DefaultRunEvery: 1, UnknownThreshold: 5, MinGroupSize: 5, OpenTSDBConf: conf.OpenTSDBConf{Host: u.Host, ResponseLimit: 1 << 20}}
169+
} else {
170+
sysConf.OpenTSDBConf.Host = u.Host
171+
}
172+
s, _ = initSched(sysConf, c, time.Date(1900, 0, 0, 0, 0, 0, 0, time.UTC), cluster)
169173
for ak, time := range st.touched {
170174
s.DataAccess.State().TouchAlertKey(ak, time)
171175
}
@@ -213,7 +217,7 @@ var window5Min = `"9.467277e+08", "9.46728e+08"`
213217

214218
func TestCrit(t *testing.T) {
215219
defer setup()()
216-
s := testSched(t, &schedTest{
220+
s := testSched(t, nil, &schedTest{
217221
conf: `alert a {
218222
crit = avg(q("avg:m{a=b}", "5m", "")) > 0
219223
}`,
@@ -237,7 +241,7 @@ func TestCrit(t *testing.T) {
237241

238242
func TestClusterEnabledFollover_AlertRun(t *testing.T) {
239243
defer setup()()
240-
s := testSched(t, &schedTest{
244+
s := testSched(t, nil, &schedTest{
241245
conf: `alert a {
242246
crit = avg(q("avg:m{a=b}", "5m", "")) > 0
243247
}`,
@@ -261,7 +265,7 @@ func TestClusterEnabledFollover_AlertRun(t *testing.T) {
261265

262266
func TestBandDisableUnjoined(t *testing.T) {
263267
defer setup()()
264-
testSched(t, &schedTest{
268+
testSched(t, nil, &schedTest{
265269
conf: `alert a {
266270
$sum = "sum:m{a=*}"
267271
$band = band($sum, "1m", "1h", 1)
@@ -288,7 +292,7 @@ func TestBandDisableUnjoined(t *testing.T) {
288292

289293
func TestCount(t *testing.T) {
290294
defer setup()()
291-
testSched(t, &schedTest{
295+
testSched(t, nil, &schedTest{
292296
conf: `alert a {
293297
crit = count("sum:m{a=*}", "5m", "") != 2
294298
}`,
@@ -311,7 +315,7 @@ func TestCount(t *testing.T) {
311315

312316
func TestUnknown(t *testing.T) {
313317
defer setup()()
314-
testSched(t, &schedTest{
318+
testSched(t, nil, &schedTest{
315319
conf: `alert a {
316320
crit = avg(q("avg:m{a=*}", "5m", "")) > 0
317321
}`,
@@ -328,9 +332,39 @@ func TestUnknown(t *testing.T) {
328332
}, nil)
329333
}
330334

335+
func TestDelayedUnknown(t *testing.T) {
336+
defer setup()()
337+
testSched(
338+
t,
339+
&conf.SystemConf{
340+
CheckFrequency: conf.Duration{Duration: time.Minute * 5},
341+
DefaultRunEvery: 1,
342+
UnknownThreshold: 5,
343+
MinGroupSize: 5,
344+
ProblemRunsToUnknown: 2,
345+
OpenTSDBConf: conf.OpenTSDBConf{ResponseLimit: 1 << 20},
346+
},
347+
&schedTest{
348+
conf: `alert a {
349+
crit = avg(q("avg:m{a=*}", "5m", "")) > 0
350+
}`,
351+
queries: map[string]opentsdb.ResponseSet{
352+
`q("avg:m{a=*}", ` + window5Min + `)`: {},
353+
},
354+
state: map[schedState]bool{
355+
{"a{a=d}", "unknown"}: true,
356+
},
357+
touched: map[models.AlertKey]time.Time{
358+
"a{a=b}": queryTime.Add(-10 * time.Minute),
359+
"a{a=c}": queryTime.Add(-12 * time.Minute),
360+
"a{a=d}": queryTime.Add(-15 * time.Minute),
361+
},
362+
}, nil)
363+
}
364+
331365
func TestUnknown_HalfFreq(t *testing.T) {
332366
defer setup()()
333-
testSched(t, &schedTest{
367+
testSched(t, nil, &schedTest{
334368
conf: `alert a {
335369
crit = avg(q("avg:m{a=*}", "5m", "")) > 0
336370
runEvery = 2
@@ -351,7 +385,7 @@ func TestUnknown_HalfFreq(t *testing.T) {
351385
func TestUnknown_WithError(t *testing.T) {
352386
defer setup()()
353387

354-
s := testSched(t, &schedTest{
388+
s := testSched(t, nil, &schedTest{
355389
conf: `alert a {
356390
crit = avg(q("avg:m{a=*}", "5m", "")) > 0
357391
}`,
@@ -371,7 +405,7 @@ func TestUnknown_WithError(t *testing.T) {
371405

372406
func TestRename(t *testing.T) {
373407
defer setup()()
374-
testSched(t, &schedTest{
408+
testSched(t, nil, &schedTest{
375409
conf: `
376410
alert ping.host {
377411
@@ -425,7 +459,7 @@ func TestRename(t *testing.T) {
425459

426460
func TestUnknownsAreNormal(t *testing.T) {
427461
defer setup()()
428-
testSched(t, &schedTest{
462+
testSched(t, nil, &schedTest{
429463
conf: `alert a {
430464
unknownIsNormal = true
431465
crit = avg(q("avg:m{a=*}", "5m", "")) > 0

0 commit comments

Comments
 (0)