Skip to content

Commit

Permalink
DEV-46410 Do not send a notification on error or no data state (#41)
Browse files Browse the repository at this point in the history
**What is this feature?**
Added the following changes:
* Do not send a notification on error or no data state.
* Change default ExecErrState to OK
* Enforce OK state of ExecErrState no matter what value is sent in API calls


**Why do we need this feature?**
What currently is happening in “vanilla” 10:
* Alerts with No Data or Error state will send notification to customer on that state
* User can choose on alert what state to give given NoData/Error - Alerting/NoData/Error/OK/KeepLastState …

**Problems:**
* There is no “mute” time for notifications, so if there is No Data or Error states for an alert, it will continue to notify until issue is resolve, spamming user with a lot of notifications, and cannot be configured by user.
* We don’t want customer to get Error state on alerts, since it is internal issues.

**What we want:**
* Don't send notification on No Data or Error
* Set default error handling state to OK and enforce OK state.
  • Loading branch information
yasmin-tr authored Sep 22, 2024
1 parent 63db252 commit da350c4
Show file tree
Hide file tree
Showing 12 changed files with 103 additions and 35 deletions.
26 changes: 14 additions & 12 deletions pkg/services/ngalert/api/api_ruler_validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,20 @@ func validateRuleNode(
}
}

errorState := ngmodels.AlertingErrState

if ruleNode.GrafanaManagedAlert.ExecErrState == "" && canPatch {
errorState = ""
}

if ruleNode.GrafanaManagedAlert.ExecErrState != "" {
errorState, err = ngmodels.ErrStateFromString(string(ruleNode.GrafanaManagedAlert.ExecErrState))
if err != nil {
return nil, err
}
}
// LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK and enforce OK value
errorState := ngmodels.OkErrState

//if ruleNode.GrafanaManagedAlert.ExecErrState == "" && canPatch {
// errorState = ""
//}
//
//if ruleNode.GrafanaManagedAlert.ExecErrState != "" {
// if err != nil {
// return nil, err
// }
// errorState, err = ngmodels.ErrStateFromString(string(ruleNode.GrafanaManagedAlert.ExecErrState))
//}
// LOGZ.IO GRAFANA CHANGE :: End

if len(ruleNode.GrafanaManagedAlert.Data) == 0 {
if canPatch {
Expand Down
2 changes: 1 addition & 1 deletion pkg/services/ngalert/migration/migration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1235,7 +1235,7 @@ func TestDashAlertQueryMigration(t *testing.T) {
return a.ID < b.ID
}),
cmpopts.IgnoreUnexported(ngModels.AlertRule{}, ngModels.AlertQuery{}),
cmpopts.IgnoreFields(ngModels.AlertRule{}, "Updated", "UID", "ID"),
cmpopts.IgnoreFields(ngModels.AlertRule{}, "Updated", "UID", "ID", "ExecErrState"), // LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK and enforce OK value
}
if tt.expectedFolder != nil {
cOpt = append(cOpt, cmpopts.IgnoreFields(ngModels.AlertRule{}, "NamespaceUID"))
Expand Down
2 changes: 1 addition & 1 deletion pkg/services/ngalert/migration/permissions_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -744,7 +744,7 @@ func TestDashAlertPermissionMigration(t *testing.T) {
return a.Permission < b.Permission
}),
cmpopts.IgnoreUnexported(ngModels.AlertRule{}, ngModels.AlertQuery{}),
cmpopts.IgnoreFields(ngModels.AlertRule{}, "ID", "Updated", "UID"),
cmpopts.IgnoreFields(ngModels.AlertRule{}, "ID", "Updated", "UID", "ExecErrState"), // LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK and enforce OK value
cmpopts.IgnoreFields(dashboards.Dashboard{}, "ID", "Created", "Updated", "Data", "Slug"),
}
if !cmp.Equal(tt.expected, actual, cOpt...) {
Expand Down
2 changes: 1 addition & 1 deletion pkg/services/ngalert/migration/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1617,7 +1617,7 @@ func compareRules(t *testing.T, x *xorm.Engine, orgId int64, expectedRules []*mo
return a.Title < b.Title
}),
cmpopts.IgnoreUnexported(models.AlertRule{}, models.AlertQuery{}),
cmpopts.IgnoreFields(models.AlertRule{}, "Updated", "UID", "ID", "Version"),
cmpopts.IgnoreFields(models.AlertRule{}, "Updated", "UID", "ID", "Version", "ExecErrState"), // LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK and enforce OK value
}
if !cmp.Equal(expectedRules, rules, cOpt...) {
t.Errorf("Unexpected Rule: %v", cmp.Diff(expectedRules, rules, cOpt...))
Expand Down
2 changes: 2 additions & 0 deletions pkg/services/ngalert/provisioning/alert_rules.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ func (service *AlertRuleService) CreateAlertRule(ctx context.Context, rule model
}
}
}
rule.ExecErrState = models.OkErrState // LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK and enforce OK value
err = service.xact.InTransaction(ctx, func(ctx context.Context) error {
ids, err := service.ruleStore.InsertAlertRules(ctx, []models.AlertRule{
rule,
Expand Down Expand Up @@ -411,6 +412,7 @@ func (service *AlertRuleService) UpdateAlertRule(ctx context.Context, rule model
rule.Updated = time.Now()
rule.ID = storedRule.ID
rule.IntervalSeconds = storedRule.IntervalSeconds
rule.ExecErrState = models.OkErrState // LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK and enforce OK value
err = rule.SetDashboardAndPanelFromAnnotations()
if err != nil {
return models.AlertRule{}, err
Expand Down
24 changes: 24 additions & 0 deletions pkg/services/ngalert/provisioning/alert_rules_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,20 @@ func TestAlertRuleService(t *testing.T) {
require.NoError(t, err)
})

// LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK and enforce OK value
t.Run("update alert rule ExecErrState enforces OK state", func(t *testing.T) {
rule := dummyRule("test-update-errstate", orgID)
rule, err := ruleService.CreateAlertRule(context.Background(), rule, models.ProvenanceNone, 0)
require.NoError(t, err)
require.Equal(t, models.OkErrState, rule.ExecErrState)

rule.ExecErrState = models.ErrorErrState
rule, err = ruleService.UpdateAlertRule(context.Background(), rule, models.ProvenanceNone)
require.NoError(t, err)
require.Equal(t, models.OkErrState, rule.ExecErrState)
})
// LOGZ.IO GRAFANA CHANGE :: End

t.Run("group creation should propagate group title correctly", func(t *testing.T) {
group := createDummyGroup("group-test-3", orgID)
group.Rules[0].RuleGroup = "something different"
Expand Down Expand Up @@ -556,6 +570,16 @@ func TestCreateAlertRule(t *testing.T) {
require.NoError(t, err)
})
})

// LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK and enforce OK value
t.Run("should create rule with ExecErrState OK state enforced", func(t *testing.T) {
rule := dummyRule("test-create-errstate", orgID)
rule.ExecErrState = models.AlertingErrState
rule, err := ruleService.CreateAlertRule(context.Background(), rule, models.ProvenanceNone, 0)
require.NoError(t, err)
require.Equal(t, models.OkErrState, rule.ExecErrState)
})
// LOGZ.IO GRAFANA CHANGE :: End
}

func createAlertRuleService(t *testing.T) AlertRuleService {
Expand Down
19 changes: 13 additions & 6 deletions pkg/services/ngalert/schedule/schedule_unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -752,13 +752,20 @@ func TestSchedule_ruleRoutine(t *testing.T) {
require.NoError(t, err)
})

t.Run("it should send special alert DatasourceError", func(t *testing.T) {
sender.AssertNumberOfCalls(t, "Send", 1)
args, ok := sender.Calls()[0].Arguments[2].(definitions.PostableAlerts)
require.Truef(t, ok, fmt.Sprintf("expected argument of function was supposed to be 'definitions.PostableAlerts' but got %T", sender.Calls()[0].Arguments[2]))
assert.Len(t, args.PostableAlerts, 1)
assert.Equal(t, state.ErrorAlertName, args.PostableAlerts[0].Labels[prometheusModel.AlertNameLabel])
// LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Do not send a notification on error or no data state
//t.Run("it should send special alert DatasourceError", func(t *testing.T) {
// sender.AssertNumberOfCalls(t, "Send", 1)
// args, ok := sender.Calls()[0].Arguments[2].(definitions.PostableAlerts)
// require.Truef(t, ok, fmt.Sprintf("expected argument of function was supposed to be 'definitions.PostableAlerts' but got %T", sender.Calls()[0].Arguments[2]))
// assert.Len(t, args.PostableAlerts, 1)
// assert.Equal(t, state.ErrorAlertName, args.PostableAlerts[0].Labels[prometheusModel.AlertNameLabel])
//})
t.Run("it should not send special alert DatasourceError", func(t *testing.T) {
sender.AssertNotCalled(t, "Send", mock.Anything, mock.Anything)

require.NotEmpty(t, sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID))
})
// LOGZ.IO GRAFANA CHANGE :: End
})

t.Run("when there are alerts that should be firing", func(t *testing.T) {
Expand Down
4 changes: 4 additions & 0 deletions pkg/services/ngalert/state/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,10 @@ func (a *State) NeedsSending(resendDelay time.Duration) bool {
case eval.Normal:
// We should send a notification if the state is Normal because it was resolved
return a.Resolved
// LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Do not send a notification on error or no data state
case eval.NoData, eval.Error:
return false
// LOGZ.IO GRAFANA CHANGE :: end
default:
// We should send, and re-send notifications, each time LastSentAt is <= LastEvaluationTime + resendDelay
nextSent := a.LastSentAt.Add(resendDelay)
Expand Down
12 changes: 8 additions & 4 deletions pkg/services/ngalert/state/state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -426,8 +426,10 @@ func TestNeedsSending(t *testing.T) {
},
},
{
name: "state: no-data, needs to be re-sent",
expected: true,
// LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Do not send a notification on error or no data state
name: "state: no-data, should not be re-sent with evaluation time ",
expected: false,
// LOGZ.IO GRAFANA CHANGE :: End
resendDelay: 1 * time.Minute,
testState: &State{
State: eval.NoData,
Expand All @@ -446,8 +448,10 @@ func TestNeedsSending(t *testing.T) {
},
},
{
name: "state: error, needs to be re-sent",
expected: true,
// LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Do not send a notification on error or no data state
name: "state: error, should not be re-sent with evaluation time",
expected: false,
// LOGZ.IO GRAFANA CHANGE :: End
resendDelay: 1 * time.Minute,
testState: &State{
State: eval.Error,
Expand Down
2 changes: 2 additions & 0 deletions pkg/services/ngalert/store/alert_rule.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ func (st DBstore) InsertAlertRules(ctx context.Context, rules []ngmodels.AlertRu
r.UID = uid
}
r.Version = 1
r.ExecErrState = ngmodels.OkErrState // LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK and enforce OK value
if err := st.validateAlertRule(r); err != nil {
return err
}
Expand Down Expand Up @@ -203,6 +204,7 @@ func (st DBstore) UpdateAlertRules(ctx context.Context, rules []ngmodels.UpdateR
var parentVersion int64
r.New.ID = r.Existing.ID
r.New.Version = r.Existing.Version // xorm will take care of increasing it (see https://xorm.io/docs/chapter-06/1.lock/)
r.New.ExecErrState = ngmodels.OkErrState // LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK and enforce OK value
if err := st.validateAlertRule(r.New); err != nil {
return err
}
Expand Down
20 changes: 13 additions & 7 deletions pkg/tests/api/alerting/api_alertmanager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,7 @@ func TestIntegrationDeleteFolderWithRules(t *testing.T) {
re = regexp.MustCompile(`"updated":"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)"`)
b = re.ReplaceAll(b, []byte(`"updated":"2021-05-19T19:47:55Z"`))

// LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK
expectedGetRulesResponseBody := fmt.Sprintf(`{
"default": [
{
Expand Down Expand Up @@ -776,7 +777,7 @@ func TestIntegrationDeleteFolderWithRules(t *testing.T) {
"namespace_uid": %q,
"rule_group": "arulegroup",
"no_data_state": "NoData",
"exec_err_state": "Alerting"
"exec_err_state": "OK"
}
}
]
Expand Down Expand Up @@ -1207,6 +1208,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
// copy result to a variable with a wider scope
// to be used by the next test
ruleUID = generatedUIDs[0]
// LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK
expectedGetNamespaceResponseBody = `
{
"default":[
Expand Down Expand Up @@ -1253,7 +1255,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
"namespace_uid":"nsuid",
"rule_group":"arulegroup",
"no_data_state":"NoData",
"exec_err_state":"Alerting"
"exec_err_state":"OK"
}
},
{
Expand Down Expand Up @@ -1289,7 +1291,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
"namespace_uid":"nsuid",
"rule_group":"arulegroup",
"no_data_state":"Alerting",
"exec_err_state":"Alerting"
"exec_err_state":"OK"
}
}
]
Expand Down Expand Up @@ -1549,6 +1551,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
assert.True(t, ok)
assert.Equal(t, 1, len(returnedUIDs))
assert.Equal(t, ruleUID, returnedUIDs[0])
// LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK
assert.JSONEq(t, `
{
"default":[
Expand Down Expand Up @@ -1597,7 +1600,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
"namespace_uid":"nsuid",
"rule_group":"arulegroup",
"no_data_state":"Alerting",
"exec_err_state":"Alerting"
"exec_err_state":"OK"
}
}
]
Expand Down Expand Up @@ -1666,6 +1669,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
assert.True(t, ok)
assert.Equal(t, 1, len(returnedUIDs))
assert.Equal(t, ruleUID, returnedUIDs[0])
// LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK
assert.JSONEq(t, `
{
"default":[
Expand Down Expand Up @@ -1706,7 +1710,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
"namespace_uid":"nsuid",
"rule_group":"arulegroup",
"no_data_state":"Alerting",
"exec_err_state":"Alerting"
"exec_err_state":"OK"
}
}
]
Expand Down Expand Up @@ -1754,6 +1758,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
assert.True(t, ok)
assert.Equal(t, 1, len(returnedUIDs))
assert.Equal(t, ruleUID, returnedUIDs[0])
// LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK
assert.JSONEq(t, `
{
"default":[
Expand Down Expand Up @@ -1794,7 +1799,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
"namespace_uid":"nsuid",
"rule_group":"arulegroup",
"no_data_state":"Alerting",
"exec_err_state":"Alerting"
"exec_err_state":"OK"
}
}
]
Expand Down Expand Up @@ -2209,6 +2214,7 @@ func TestIntegrationQuota(t *testing.T) {
assert.True(t, ok)
assert.Equal(t, 1, len(returnedUIDs))
assert.Equal(t, ruleUID, returnedUIDs[0])
// LOGZ.IO GRAFANA CHANGE :: DEV-46410 - Change default ExecErrState to OK
assert.JSONEq(t, `
{
"default":[
Expand Down Expand Up @@ -2249,7 +2255,7 @@ func TestIntegrationQuota(t *testing.T) {
"namespace_uid":"nsuid",
"rule_group":"arulegroup",
"no_data_state":"NoData",
"exec_err_state":"Alerting"
"exec_err_state":"OK"
}
}
]
Expand Down
Loading

0 comments on commit da350c4

Please sign in to comment.