Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[improve] update alarm inhibit rule and alarm ui #2957

Merged
merged 25 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ public class AlerterProperties {
*/
private EntranceProperties entrance;

/**
* Inhibit configuration properties
*/
private InhibitProperties inhibit;

/**
* Data entry configuration properties
*/
Expand All @@ -105,4 +110,17 @@ public static class KafkaProperties extends BaseKafkaProperties {
}
}

/**
* Inhibit configuration properties
*/
@Getter
@Setter
public static class InhibitProperties {

/**
* inhibit rule cache ttl, default 4h
*/
private long ttl = 4 * 60 * 60 * 1000L;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,21 @@
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import lombok.extern.slf4j.Slf4j;
import org.apache.hertzbeat.alert.AlerterProperties;
import org.apache.hertzbeat.alert.dao.AlertInhibitDao;
import org.apache.hertzbeat.common.constants.CommonConstants;
import org.apache.hertzbeat.common.entity.alerter.AlertInhibit;
import org.apache.hertzbeat.common.entity.alerter.GroupAlert;
import org.apache.hertzbeat.common.entity.alerter.SingleAlert;
import org.springframework.stereotype.Component;

import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.Collections;
import java.util.stream.Collectors;
import java.util.HashMap;

import lombok.Data;
import lombok.AllArgsConstructor;
Expand All @@ -44,30 +47,40 @@
@Component
@Slf4j
public class AlarmInhibitReduce {

/**
* Default TTL for source alerts (4 hours)
*/
private static final long SOURCE_ALERT_TTL = 4 * 60 * 60 * 1000L;


/**
* Interval for checking and cleaning up expired source alerts
*/
private static final long CHECK_INTERVAL = 60_000L;

/**
* alarm silence
*/
private final AlarmSilenceReduce alarmSilenceReduce;

/**
* rule cache
*/
private final Map<Long, AlertInhibit> inhibitRules;

/**
* Cache for source alerts
* key: ruleId
* value: map of source alerts with their fingerprints
*/
private final Map<Long, Map<String, SourceAlertEntry>> sourceAlertCache;

public AlarmInhibitReduce(AlarmSilenceReduce alarmSilenceReduce, AlertInhibitDao alertInhibitDao) {
/**
* Default TTL for source alerts (4 hours)
*/
private static long SOURCE_ALERT_TTL = 4 * 60 * 60 * 1000L;

public AlarmInhibitReduce(AlarmSilenceReduce alarmSilenceReduce, AlertInhibitDao alertInhibitDao
, AlerterProperties alerterProperties) {
this.alarmSilenceReduce = alarmSilenceReduce;
if (alerterProperties.getInhibit() != null && alerterProperties.getInhibit().getTtl() > 0) {
SOURCE_ALERT_TTL = alerterProperties.getInhibit().getTtl();
}
inhibitRules = new ConcurrentHashMap<>(8);
sourceAlertCache = new ConcurrentHashMap<>(8);
List<AlertInhibit> inhibits = alertInhibitDao.findAlertInhibitsByEnableIsTrue();
Expand All @@ -94,7 +107,7 @@ private void startScheduledCleanupCache() {
} catch (Exception e) {
log.error("Error during scheduled cleanup", e);
}
}, CHECK_INTERVAL, CHECK_INTERVAL, java.util.concurrent.TimeUnit.MILLISECONDS);
}, CHECK_INTERVAL, CHECK_INTERVAL, TimeUnit.MILLISECONDS);
}

/**
Expand Down Expand Up @@ -126,63 +139,67 @@ public void inhibitAlarm(GroupAlert groupAlert) {
return;
}

for (AlertInhibit rule : inhibitRules.values()) {
if (isSourceAlert(groupAlert, rule)) {
cacheSourceAlert(groupAlert, rule);
// Process each individual alert
for (var alert : groupAlert.getAlerts()) {
for (AlertInhibit rule : inhibitRules.values()) {
if (isSourceAlert(alert, rule)) {
cacheSourceAlert(alert, rule);
}
}
}

if (shouldInhibit(groupAlert)) {
log.debug("Alert {} is inhibited", groupAlert);
return;
}
// Filter out inhibited alerts
groupAlert.getAlerts().removeIf(this::shouldInhibit);

alarmSilenceReduce.silenceAlarm(groupAlert);
// Continue processing if there are remaining alerts
if (!groupAlert.getAlerts().isEmpty()) {
alarmSilenceReduce.silenceAlarm(groupAlert);
}
} catch (Exception e) {
log.error("Error inhibiting alarm for {}", groupAlert, e);
}
}

/**
* Check if alert matches inhibit rule source labels
* @param alert Grouped and pending alerts to be processed
* @param alert Single alert to be processed
* @param rule The rule of inhibition
*/
private boolean isSourceAlert(GroupAlert alert, AlertInhibit rule) {
private boolean isSourceAlert(SingleAlert alert, AlertInhibit rule) {
if (alert == null || rule == null) {
log.warn("Received null alert or rule in isSourceAlert");
return false;
}
if (!"firing".equals(alert.getStatus())) {
if (!CommonConstants.ALERT_STATUS_FIRING.equals(alert.getStatus())) {
return false;
}
return matchLabels(alert.getCommonLabels(), rule.getSourceLabels());
return matchLabels(alert.getLabels(), rule.getSourceLabels());
}

/**
* Check if alert should be inhibited by any active source alerts
* @param alert Grouped and pending alerts to be processed
* @param alert Single alert to be processed
*/
private boolean shouldInhibit(GroupAlert alert) {
private boolean shouldInhibit(SingleAlert alert) {
if (alert == null) {
log.warn("Received null alert in shouldInhibit");
return false;
}
if ("resolved".equals(alert.getStatus())) {
if (CommonConstants.ALERT_STATUS_RESOLVED.equals(alert.getStatus())) {
return false;
}

for (AlertInhibit rule : inhibitRules.values()) {
if (!matchLabels(alert.getCommonLabels(), rule.getTargetLabels())) {
if (!matchLabels(alert.getLabels(), rule.getTargetLabels())) {
continue;
}

List<GroupAlert> sourceAlerts = getActiveSourceAlerts(rule);
List<SingleAlert> sourceAlerts = getActiveSourceAlerts(rule);
if (sourceAlerts.isEmpty()) {
continue;
}

for (GroupAlert source : sourceAlerts) {
for (SingleAlert source : sourceAlerts) {
if (matchEqualLabels(source, alert, rule.getEqualLabels())) {
return true;
}
Expand All @@ -207,20 +224,20 @@ private boolean matchLabels(Map<String, String> alertLabels, Map<String, String>

/**
* Check if equal labels have same values in both alerts
* @param source Alarm used to suppress other alarms
* @param target Alarm that may be suppressed
* @param source Alert used to suppress other alerts
* @param target Alert that may be suppressed
* @param equalLabels Need to be equal labels
*/
private boolean matchEqualLabels(GroupAlert source, GroupAlert target, List<String> equalLabels) {
private boolean matchEqualLabels(SingleAlert source, SingleAlert target, List<String> equalLabels) {
if (source == null || target == null) {
log.warn("Received null source or target in matchEqualLabels");
return false;
}
if (equalLabels == null || equalLabels.isEmpty()) {
return true;
}
Map<String, String> sourceLabels = source.getCommonLabels();
Map<String, String> targetLabels = target.getCommonLabels();
Map<String, String> sourceLabels = source.getLabels();
Map<String, String> targetLabels = target.getLabels();

return equalLabels.stream().allMatch(label -> {
String sourceValue = sourceLabels.get(label);
Expand All @@ -231,10 +248,10 @@ private boolean matchEqualLabels(GroupAlert source, GroupAlert target, List<Stri

/**
* Cache source alert for inhibit rule
* @param alert Grouped and pending alerts to be processed
* @param alert Single alert to be processed
* @param rule The rule of inhibition
*/
private void cacheSourceAlert(GroupAlert alert, AlertInhibit rule) {
private void cacheSourceAlert(SingleAlert alert, AlertInhibit rule) {
if (alert == null || rule == null) {
log.warn("Received null alert or rule in cacheSourceAlert");
return;
Expand All @@ -243,22 +260,22 @@ private void cacheSourceAlert(GroupAlert alert, AlertInhibit rule) {
rule.getId(),
k -> new ConcurrentHashMap<>()
);

String fingerprint = generateAlertFingerprint(alert);

SourceAlertEntry entry = new SourceAlertEntry(
alert,
System.currentTimeMillis(),
System.currentTimeMillis() + SOURCE_ALERT_TTL
);
ruleCache.put(fingerprint, entry);
ruleCache.put(alert.getFingerprint(), entry);
cleanupExpiredEntries(ruleCache);
}

/**
* Get active source alerts for inhibit rule
* @param rule The rule of inhibition
* @return List of active source alerts
*/
private List<GroupAlert> getActiveSourceAlerts(AlertInhibit rule) {
private List<SingleAlert> getActiveSourceAlerts(AlertInhibit rule) {
if (rule == null) {
log.warn("Received null rule in getActiveSourceAlerts");
return Collections.emptyList();
Expand All @@ -275,24 +292,6 @@ private List<GroupAlert> getActiveSourceAlerts(AlertInhibit rule) {
.collect(Collectors.toList());
}

/**
* Generate fingerprint for alert deduplication
* @param alert Grouped and pending alerts to be processed
*/
private String generateAlertFingerprint(GroupAlert alert) {
if (alert == null) {
log.warn("Received null alert in generateAlertFingerprint");
return "";
}
Map<String, String> labels = new HashMap<>(alert.getCommonLabels());
labels.remove("timestamp");

return labels.entrySet().stream()
.sorted(Map.Entry.comparingByKey())
.map(e -> e.getKey() + ":" + e.getValue())
.collect(Collectors.joining(","));
}

/**
* Remove expired entries from cache
* @param cache Source alert cache entry map
Expand All @@ -312,7 +311,7 @@ private void cleanupExpiredEntries(Map<String, SourceAlertEntry> cache) {
@Data
@AllArgsConstructor
private static class SourceAlertEntry {
private final GroupAlert alert;
private final SingleAlert alert;
private final long createTime;
private final long expiryTime;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package org.apache.hertzbeat.alert.reduce;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
package org.apache.hertzbeat.alert.reduce;
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hertzbeat.alert.reduce;




import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.argThat;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import org.apache.hertzbeat.alert.dao.AlertGroupConvergeDao;
import org.apache.hertzbeat.common.entity.alerter.AlertGroupConverge;
import org.apache.hertzbeat.common.entity.alerter.SingleAlert;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mock;
import org.mockito.MockitoAnnotations;

/**
* Test for AlarmGroupReduce
*/
class AlarmGroupReduceTest {

@Mock
private AlarmInhibitReduce alarmInhibitReduce;

@Mock
private AlertGroupConvergeDao alertGroupConvergeDao;

private AlarmGroupReduce alarmGroupReduce;

@BeforeEach
void setUp() {
MockitoAnnotations.openMocks(this);
when(alertGroupConvergeDao.findAlertGroupConvergesByEnableIsTrue())
.thenReturn(Collections.emptyList());
alarmGroupReduce = new AlarmGroupReduce(alarmInhibitReduce, alertGroupConvergeDao);
}

@Test
void whenNoGroupRules_shouldSendSingleAlert() {
SingleAlert alert = SingleAlert.builder()
.fingerprint("fp1")
.status("firing")
.labels(createLabels("severity", "critical"))
.build();

alarmGroupReduce.processGroupAlert(alert);

verify(alarmInhibitReduce).inhibitAlarm(argThat(group ->
group.getAlerts().size() == 1 && group.getAlerts().get(0).getFingerprint().equals("fp1")));
}

@Test
void whenMatchingGroupRule_shouldGroup() {
// Setup group rule
AlertGroupConverge rule = new AlertGroupConverge();
rule.setName("test-rule");
rule.setGroupLabels(Arrays.asList("severity", "instance"));
when(alertGroupConvergeDao.findAlertGroupConvergesByEnableIsTrue())
.thenReturn(Collections.singletonList(rule));
alarmGroupReduce.refreshGroupDefines(Collections.singletonList(rule));

SingleAlert alert = SingleAlert.builder()
.fingerprint("fp1")
.status("firing")
.labels(createLabels("severity", "critical", "instance", "host1"))
.build();

alarmGroupReduce.processGroupAlert(alert);

// Verify group is created and cached (implicitly tested through internal state)
verify(alarmInhibitReduce, never()).inhibitAlarm(any()); // Should not send immediately due to group wait
}

private Map<String, String> createLabels(String... keyValues) {
Map<String, String> labels = new HashMap<>();
for (int i = 0; i < keyValues.length; i += 2) {
labels.put(keyValues[i], keyValues[i + 1]);
}
return labels;
}
}
Loading
Loading